Merge remote-tracking branch 'remotes/origin/master' into newdirstructure

author: vagrant <vagrant@localhost.localdomain> 2019-02-28 15:00:39 +0000
committer: vagrant <vagrant@localhost.localdomain> 2019-02-28 15:00:39 +0000
commit: 364a703de9f31b35d4301f3e913f519be9d3a14f (patch)
tree: a398909ff87b22745829657f3e62b0439a64ad77
parent: 7bb99cfd904b23c041be273ffc2746296e6eb814 (diff)
parent: 4c728cf72345f7ab7967380cb536529fd9b1403d (diff)
download: regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.gz
regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.bz2
regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.xz
regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.zip
11 files changed, 1158 insertions, 279 deletions
diff --git a/Wrappers/Python/demos/SoftwareX_supp/Demo_RealData_Recon_SX.py b/Wrappers/Python/demos/SoftwareX_supp/Demo_RealData_Recon_SX.py
new file mode 100644
index 0000000..01491d9
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/Demo_RealData_Recon_SX.py
@@ -0,0 +1,231 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This demo scripts support the following publication: 
+"CCPi-Regularisation Toolkit for computed tomographic image reconstruction with 
+proximal splitting algorithms" by Daniil Kazantsev, Edoardo Pasca, Martin J. Turner,
+ Philip J. Withers; Software X, 2019
+____________________________________________________________________________
+* Reads real tomographic data (stored at Zenodo)
+--- https://doi.org/10.5281/zenodo.2578893
+* Reconstructs using TomoRec software
+* Saves reconstructed images 
+____________________________________________________________________________
+>>>>> Dependencies: <<<<<
+1. ASTRA toolbox: conda install -c astra-toolbox astra-toolbox
+2. TomoRec: conda install -c dkazanc tomorec
+or install from https://github.com/dkazanc/TomoRec
+3. libtiff if one needs to save tiff images:
+    install pip install libtiff
+
+@author: Daniil Kazantsev, e:mail daniil.kazantsev@diamond.ac.uk
+GPLv3 license (ASTRA toolbox)
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import h5py
+from tomorec.supp.suppTools import normaliser
+import time
+
+# load dendritic projection data
+h5f = h5py.File('data/DendrData_3D.h5','r')
+dataRaw = h5f['dataRaw'][:]
+flats = h5f['flats'][:]
+darks = h5f['darks'][:]
+angles_rad = h5f['angles_rad'][:]
+h5f.close()
+#%%
+# normalise the data [detectorsVert, Projections, detectorsHoriz]
+data_norm = normaliser(dataRaw, flats, darks, log='log')
+del dataRaw, darks, flats
+
+intens_max = 2.3
+plt.figure() 
+plt.subplot(131)
+plt.imshow(data_norm[:,150,:],vmin=0, vmax=intens_max)
+plt.title('2D Projection (analytical)')
+plt.subplot(132)
+plt.imshow(data_norm[300,:,:],vmin=0, vmax=intens_max)
+plt.title('Sinogram view')
+plt.subplot(133)
+plt.imshow(data_norm[:,:,600],vmin=0, vmax=intens_max)
+plt.title('Tangentogram view')
+plt.show()
+
+detectorHoriz = np.size(data_norm,2)
+det_y_crop = [i for i in range(0,detectorHoriz-22)]
+N_size = 950 # reconstruction domain
+time_label = int(time.time())
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("%%%%%%%%%%%%Reconstructing with FBP method %%%%%%%%%%%%%%%%%")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+from tomorec.methodsDIR import RecToolsDIR
+
+RectoolsDIR = RecToolsDIR(DetectorsDimH = np.size(det_y_crop),  # DetectorsDimH # detector dimension (horizontal)
+                    DetectorsDimV = 100,  # DetectorsDimV # detector dimension (vertical) for 3D case only
+                    AnglesVec = angles_rad, # array of angles in radians
+                    ObjSize = N_size, # a scalar to define reconstructed object dimensions
+                    device='gpu')
+
+FBPrec = RectoolsDIR.FBP(data_norm[0:100,:,det_y_crop])
+
+sliceSel = 50
+max_val = 0.003
+plt.figure() 
+plt.subplot(131)
+plt.imshow(FBPrec[sliceSel,:,:],vmin=0, vmax=max_val, cmap="gray")
+plt.title('FBP Reconstruction, axial view')
+
+plt.subplot(132)
+plt.imshow(FBPrec[:,sliceSel,:],vmin=0, vmax=max_val, cmap="gray")
+plt.title('FBP Reconstruction, coronal view')
+
+plt.subplot(133)
+plt.imshow(FBPrec[:,:,sliceSel],vmin=0, vmax=max_val, cmap="gray")
+plt.title('FBP Reconstruction, sagittal view')
+plt.show()
+
+# saving to tiffs (16bit)
+"""
+from libtiff import TIFF
+FBPrec += np.abs(np.min(FBPrec))
+multiplier = (int)(65535/(np.max(FBPrec)))
+
+# saving to tiffs (16bit)
+for i in range(0,np.size(FBPrec,0)):
+    tiff = TIFF.open('Dendr_FBP'+'_'+str(i)+'.tiff', mode='w')
+    tiff.write_image(np.uint16(FBPrec[i,:,:]*multiplier))
+    tiff.close()
+"""
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("Reconstructing with ADMM method using TomoRec software")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+# initialise TomoRec ITERATIVE reconstruction class ONCE
+from tomorec.methodsIR import RecToolsIR
+RectoolsIR = RecToolsIR(DetectorsDimH =  np.size(det_y_crop),  # DetectorsDimH # detector dimension (horizontal)
+                    DetectorsDimV = 100,  # DetectorsDimV # detector dimension (vertical) for 3D case only
+                    AnglesVec = angles_rad, # array of angles in radians
+                    ObjSize = N_size, # a scalar to define reconstructed object dimensions
+                    datafidelity='LS',# data fidelity, choose LS, PWLS (wip), GH (wip), Student (wip)
+                    nonnegativity='ENABLE', # enable nonnegativity constraint (set to 'ENABLE')
+                    OS_number = None, # the number of subsets, NONE/(or > 1) ~ classical / ordered subsets
+                    tolerance = 1e-08, # tolerance to stop outer iterations earlier
+                    device='gpu')
+#%%
+print ("Reconstructing with ADMM method using SB-TV penalty")
+RecADMM_reg_sbtv = RectoolsIR.ADMM(data_norm[0:100,:,det_y_crop],
+                              rho_const = 2000.0, \
+                              iterationsADMM = 15, \
+                              regularisation = 'SB_TV', \
+                              regularisation_parameter = 0.00085,\
+                              regularisation_iterations = 50)
+
+sliceSel = 50
+max_val = 0.003
+plt.figure() 
+plt.subplot(131)
+plt.imshow(RecADMM_reg_sbtv[sliceSel,:,:],vmin=0, vmax=max_val, cmap="gray")
+plt.title('3D ADMM-SB-TV Reconstruction, axial view')
+
+plt.subplot(132)
+plt.imshow(RecADMM_reg_sbtv[:,sliceSel,:],vmin=0, vmax=max_val, cmap="gray")
+plt.title('3D ADMM-SB-TV Reconstruction, coronal view')
+
+plt.subplot(133)
+plt.imshow(RecADMM_reg_sbtv[:,:,sliceSel],vmin=0, vmax=max_val, cmap="gray")
+plt.title('3D ADMM-SB-TV Reconstruction, sagittal view')
+plt.show()
+
+
+# saving to tiffs (16bit)
+"""
+from libtiff import TIFF
+multiplier = (int)(65535/(np.max(RecADMM_reg_sbtv)))
+for i in range(0,np.size(RecADMM_reg_sbtv,0)):
+    tiff = TIFF.open('Dendr_ADMM_SBTV'+'_'+str(i)+'.tiff', mode='w')
+    tiff.write_image(np.uint16(RecADMM_reg_sbtv[i,:,:]*multiplier))
+    tiff.close()
+"""
+# Saving recpnstructed data with a unique time label
+np.save('Dendr_ADMM_SBTV'+str(time_label)+'.npy', RecADMM_reg_sbtv)
+del RecADMM_reg_sbtv
+#%%
+print ("Reconstructing with ADMM method using ROF-LLT penalty")
+RecADMM_reg_rofllt = RectoolsIR.ADMM(data_norm[0:100,:,det_y_crop],
+                              rho_const = 2000.0, \
+                              iterationsADMM = 15, \
+                              regularisation = 'LLT_ROF', \
+                              regularisation_parameter = 0.0009,\
+                              regularisation_parameter2 = 0.0007,\
+                              time_marching_parameter = 0.001,\
+                              regularisation_iterations = 550)
+
+sliceSel = 50
+max_val = 0.003
+plt.figure() 
+plt.subplot(131)
+plt.imshow(RecADMM_reg_rofllt[sliceSel,:,:],vmin=0, vmax=max_val)
+plt.title('3D ADMM-ROFLLT Reconstruction, axial view')
+
+plt.subplot(132)
+plt.imshow(RecADMM_reg_rofllt[:,sliceSel,:],vmin=0, vmax=max_val)
+plt.title('3D ADMM-ROFLLT Reconstruction, coronal view')
+
+plt.subplot(133)
+plt.imshow(RecADMM_reg_rofllt[:,:,sliceSel],vmin=0, vmax=max_val)
+plt.title('3D ADMM-ROFLLT Reconstruction, sagittal view')
+plt.show()
+
+# saving to tiffs (16bit)
+"""
+from libtiff import TIFF
+multiplier = (int)(65535/(np.max(RecADMM_reg_rofllt)))
+for i in range(0,np.size(RecADMM_reg_rofllt,0)):
+    tiff = TIFF.open('Dendr_ADMM_ROFLLT'+'_'+str(i)+'.tiff', mode='w')
+    tiff.write_image(np.uint16(RecADMM_reg_rofllt[i,:,:]*multiplier))
+    tiff.close()
+"""
+
+# Saving recpnstructed data with a unique time label
+np.save('Dendr_ADMM_ROFLLT'+str(time_label)+'.npy', RecADMM_reg_rofllt)
+del RecADMM_reg_rofllt
+#%%
+print ("Reconstructing with ADMM method using TGV penalty")
+RecADMM_reg_tgv = RectoolsIR.ADMM(data_norm[0:100,:,det_y_crop],
+                              rho_const = 2000.0, \
+                              iterationsADMM = 15, \
+                              regularisation = 'TGV', \
+                              regularisation_parameter = 0.01,\
+                              regularisation_iterations = 500)
+
+sliceSel = 50
+max_val = 0.003
+plt.figure() 
+plt.subplot(131)
+plt.imshow(RecADMM_reg_tgv[sliceSel,:,:],vmin=0, vmax=max_val)
+plt.title('3D ADMM-TGV Reconstruction, axial view')
+
+plt.subplot(132)
+plt.imshow(RecADMM_reg_tgv[:,sliceSel,:],vmin=0, vmax=max_val)
+plt.title('3D ADMM-TGV Reconstruction, coronal view')
+
+plt.subplot(133)
+plt.imshow(RecADMM_reg_tgv[:,:,sliceSel],vmin=0, vmax=max_val)
+plt.title('3D ADMM-TGV Reconstruction, sagittal view')
+plt.show()
+
+# saving to tiffs (16bit)
+"""
+from libtiff import TIFF
+multiplier = (int)(65535/(np.max(RecADMM_reg_tgv)))
+for i in range(0,np.size(RecADMM_reg_tgv,0)):
+    tiff = TIFF.open('Dendr_ADMM_TGV'+'_'+str(i)+'.tiff', mode='w')
+    tiff.write_image(np.uint16(RecADMM_reg_tgv[i,:,:]*multiplier))
+    tiff.close()
+"""
+# Saving recpnstructed data with a unique time label
+np.save('Dendr_ADMM_TGV'+str(time_label)+'.npy', RecADMM_reg_tgv)
+del RecADMM_reg_tgv
+#%%
+\ No newline at end of file
diff --git a/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_ParOptimis_SX.py b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_ParOptimis_SX.py
new file mode 100644
index 0000000..59ffc0e
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_ParOptimis_SX.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This demo scripts support the following publication: 
+"CCPi-Regularisation Toolkit for computed tomographic image reconstruction with 
+proximal splitting algorithms" by Daniil Kazantsev, Edoardo Pasca, Martin J. Turner,
+ Philip J. Withers; Software X, 2019
+____________________________________________________________________________
+* Reads data which is previosly generated by TomoPhantom software (Zenodo link)
+--- https://doi.org/10.5281/zenodo.2578893
+* Optimises for the regularisation parameters which later used in the script:
+Demo_SimulData_Recon_SX.py
+____________________________________________________________________________
+>>>>> Dependencies: <<<<<
+>>>>> Dependencies: <<<<<
+1. ASTRA toolbox: conda install -c astra-toolbox astra-toolbox
+2. TomoRec: conda install -c dkazanc tomorec
+or install from https://github.com/dkazanc/TomoRec
+
+@author: Daniil Kazantsev, e:mail daniil.kazantsev@diamond.ac.uk
+GPLv3 license (ASTRA toolbox)
+"""
+#import timeit
+import matplotlib.pyplot as plt
+import numpy as np
+import h5py
+from ccpi.supp.qualitymetrics import QualityTools
+
+# loading the data 
+h5f = h5py.File('data/TomoSim_data1550671417.h5','r')
+phantom = h5f['phantom'][:]
+projdata_norm = h5f['projdata_norm'][:]
+proj_angles = h5f['proj_angles'][:]
+h5f.close()
+
+[Vert_det, AnglesNum, Horiz_det] = np.shape(projdata_norm)
+N_size = Vert_det
+
+sliceSel = 128
+#plt.gray()
+plt.figure() 
+plt.subplot(131)
+plt.imshow(phantom[sliceSel,:,:],vmin=0, vmax=1)
+plt.title('3D Phantom, axial view')
+
+plt.subplot(132)
+plt.imshow(phantom[:,sliceSel,:],vmin=0, vmax=1)
+plt.title('3D Phantom, coronal view')
+
+plt.subplot(133)
+plt.imshow(phantom[:,:,sliceSel],vmin=0, vmax=1)
+plt.title('3D Phantom, sagittal view')
+plt.show()
+
+intens_max = 240
+plt.figure() 
+plt.subplot(131)
+plt.imshow(projdata_norm[:,sliceSel,:],vmin=0, vmax=intens_max)
+plt.title('2D Projection (erroneous)')
+plt.subplot(132)
+plt.imshow(projdata_norm[sliceSel,:,:],vmin=0, vmax=intens_max)
+plt.title('Sinogram view')
+plt.subplot(133)
+plt.imshow(projdata_norm[:,:,sliceSel],vmin=0, vmax=intens_max)
+plt.title('Tangentogram view')
+plt.show()
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("Reconstructing with ADMM method using TomoRec software")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+# initialise TomoRec ITERATIVE reconstruction class ONCE
+from tomorec.methodsIR import RecToolsIR
+RectoolsIR = RecToolsIR(DetectorsDimH = Horiz_det,  # DetectorsDimH # detector dimension (horizontal)
+                    DetectorsDimV = Vert_det,  # DetectorsDimV # detector dimension (vertical) for 3D case only
+                    AnglesVec = proj_angles, # array of angles in radians
+                    ObjSize = N_size, # a scalar to define reconstructed object dimensions
+                    datafidelity='LS',# data fidelity, choose LS, PWLS (wip), GH (wip), Student (wip)
+                    nonnegativity='ENABLE', # enable nonnegativity constraint (set to 'ENABLE')
+                    OS_number = None, # the number of subsets, NONE/(or > 1) ~ classical / ordered subsets
+                    tolerance = 1e-08, # tolerance to stop outer iterations earlier
+                    device='gpu')
+#%%
+param_space = 30
+reg_param_sb_vec = np.linspace(0.03,0.15,param_space,dtype='float32') # a vector of parameters
+erros_vec_sbtv = np.zeros((param_space)) # a vector of errors
+
+print ("Reconstructing with ADMM method using SB-TV penalty")
+for i in range(0,param_space):
+    RecADMM_reg_sbtv = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 15, \
+                                  regularisation = 'SB_TV', \
+                                  regularisation_parameter = reg_param_sb_vec[i],\
+                                  regularisation_iterations = 50)
+    # calculate errors 
+    Qtools = QualityTools(phantom, RecADMM_reg_sbtv)
+    erros_vec_sbtv[i] = Qtools.rmse()
+    print("RMSE for regularisation parameter {} for ADMM-SB-TV is {}".format(reg_param_sb_vec[i],erros_vec_sbtv[i]))
+
+plt.figure() 
+plt.plot(erros_vec_sbtv)
+
+# Saving generated data with a unique time label
+h5f = h5py.File('Optim_admm_sbtv.h5', 'w')
+h5f.create_dataset('reg_param_sb_vec', data=reg_param_sb_vec)
+h5f.create_dataset('erros_vec_sbtv', data=erros_vec_sbtv)
+h5f.close()
+#%%
+param_space = 30
+reg_param_rofllt_vec = np.linspace(0.03,0.15,param_space,dtype='float32') # a vector of parameters
+erros_vec_rofllt = np.zeros((param_space)) # a vector of errors
+
+print ("Reconstructing with ADMM method using ROF-LLT penalty")
+for i in range(0,param_space):
+    RecADMM_reg_rofllt = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 15, \
+                                  regularisation = 'LLT_ROF', \
+                                  regularisation_parameter = reg_param_rofllt_vec[i],\
+                                  regularisation_parameter2 = 0.005,\
+                                  regularisation_iterations = 600)
+    # calculate errors 
+    Qtools = QualityTools(phantom, RecADMM_reg_rofllt)
+    erros_vec_rofllt[i] = Qtools.rmse()
+    print("RMSE for regularisation parameter {} for ADMM-ROF-LLT is {}".format(reg_param_rofllt_vec[i],erros_vec_rofllt[i]))
+
+plt.figure() 
+plt.plot(erros_vec_rofllt)
+
+# Saving generated data with a unique time label
+h5f = h5py.File('Optim_admm_rofllt.h5', 'w')
+h5f.create_dataset('reg_param_rofllt_vec', data=reg_param_rofllt_vec)
+h5f.create_dataset('erros_vec_rofllt', data=erros_vec_rofllt)
+h5f.close()
+#%%
+param_space = 30
+reg_param_tgv_vec = np.linspace(0.03,0.15,param_space,dtype='float32') # a vector of parameters
+erros_vec_tgv = np.zeros((param_space)) # a vector of errors
+
+print ("Reconstructing with ADMM method using TGV penalty")
+for i in range(0,param_space):
+    RecADMM_reg_tgv = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 15, \
+                                  regularisation = 'TGV', \
+                                  regularisation_parameter = reg_param_tgv_vec[i],\
+                                  regularisation_iterations = 600)
+    # calculate errors 
+    Qtools = QualityTools(phantom, RecADMM_reg_tgv)
+    erros_vec_tgv[i] = Qtools.rmse()
+    print("RMSE for regularisation parameter {} for ADMM-TGV is {}".format(reg_param_tgv_vec[i],erros_vec_tgv[i]))
+
+plt.figure() 
+plt.plot(erros_vec_tgv)
+
+# Saving generated data with a unique time label
+h5f = h5py.File('Optim_admm_tgv.h5', 'w')
+h5f.create_dataset('reg_param_tgv_vec', data=reg_param_tgv_vec)
+h5f.create_dataset('erros_vec_tgv', data=erros_vec_tgv)
+h5f.close()
+#%%
+\ No newline at end of file
diff --git a/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_Recon_SX.py b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_Recon_SX.py
new file mode 100644
index 0000000..93b0cef
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_Recon_SX.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This demo scripts support the following publication: 
+"CCPi-Regularisation Toolkit for computed tomographic image reconstruction with 
+proximal splitting algorithms" by Daniil Kazantsev, Edoardo Pasca, Martin J. Turner,
+ Philip J. Withers; Software X, 2019
+____________________________________________________________________________
+* Reads data which is previously generated by TomoPhantom software (Zenodo link)
+--- https://doi.org/10.5281/zenodo.2578893
+* Reconstruct using optimised regularisation parameters (see Demo_SimulData_ParOptimis_SX.py)
+____________________________________________________________________________
+>>>>> Dependencies: <<<<<
+1. ASTRA toolbox: conda install -c astra-toolbox astra-toolbox
+2. TomoRec: conda install -c dkazanc tomorec
+or install from https://github.com/dkazanc/TomoRec
+
+@author: Daniil Kazantsev, e:mail daniil.kazantsev@diamond.ac.uk
+GPLv3 license (ASTRA toolbox)
+"""
+#import timeit
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+import numpy as np
+import h5py
+from ccpi.supp.qualitymetrics import QualityTools
+from scipy.signal import gaussian
+
+# loading the data 
+h5f = h5py.File('data/TomoSim_data1550671417.h5','r')
+phantom = h5f['phantom'][:]
+projdata_norm = h5f['projdata_norm'][:]
+proj_angles = h5f['proj_angles'][:]
+h5f.close()
+
+[Vert_det, AnglesNum, Horiz_det] = np.shape(projdata_norm)
+N_size = Vert_det
+
+# loading optmisation parameters (the result of running Demo_SimulData_ParOptimis_SX)
+h5f = h5py.File('optim_param/Optim_admm_sbtv.h5','r')
+reg_param_sb_vec = h5f['reg_param_sb_vec'][:]
+erros_vec_sbtv = h5f['erros_vec_sbtv'][:]
+h5f.close()
+
+h5f = h5py.File('optim_param/Optim_admm_rofllt.h5','r')
+reg_param_rofllt_vec = h5f['reg_param_rofllt_vec'][:]
+erros_vec_rofllt = h5f['erros_vec_rofllt'][:]
+h5f.close()
+
+h5f = h5py.File('optim_param/Optim_admm_tgv.h5','r')
+reg_param_tgv_vec = h5f['reg_param_tgv_vec'][:]
+erros_vec_tgv = h5f['erros_vec_tgv'][:]
+h5f.close()
+
+index_minSBTV = min(xrange(len(erros_vec_sbtv)), key=erros_vec_sbtv.__getitem__)
+index_minROFLLT = min(xrange(len(erros_vec_rofllt)), key=erros_vec_rofllt.__getitem__)
+index_minTGV = min(xrange(len(erros_vec_tgv)), key=erros_vec_tgv.__getitem__)
+# assign optimal regularisation parameters:
+optimReg_sbtv = reg_param_sb_vec[index_minSBTV]
+optimReg_rofllt = reg_param_rofllt_vec[index_minROFLLT]
+optimReg_tgv = reg_param_tgv_vec[index_minTGV]
+#%%
+# plot loaded data
+sliceSel = 128
+#plt.figure() 
+fig, (ax1, ax2) = plt.subplots(figsize=(15, 5), ncols=2)
+plt.rcParams.update({'xtick.labelsize': 'x-small'})
+plt.rcParams.update({'ytick.labelsize':'x-small'})
+plt.subplot(121)
+one = plt.imshow(phantom[sliceSel,:,:],vmin=0, vmax=1, interpolation='none', cmap="PuOr")
+fig.colorbar(one, ax=ax1)
+plt.title('3D Phantom, axial (X-Y) view')
+plt.subplot(122)
+two = plt.imshow(phantom[:,sliceSel,:],vmin=0, vmax=1,interpolation='none', cmap="PuOr")
+fig.colorbar(two, ax=ax2)
+plt.title('3D Phantom, coronal (Y-Z) view')
+"""
+plt.subplot(133)
+plt.imshow(phantom[:,:,sliceSel],vmin=0, vmax=1, cmap="PuOr")
+plt.title('3D Phantom, sagittal view')
+
+"""
+plt.show()
+#%%
+intens_max = 220
+plt.figure() 
+plt.rcParams.update({'xtick.labelsize': 'x-small'})
+plt.rcParams.update({'ytick.labelsize':'x-small'})
+plt.subplot(131)
+plt.imshow(projdata_norm[:,sliceSel,:],vmin=0, vmax=intens_max, cmap="PuOr")
+plt.xlabel('X-detector', fontsize=16)
+plt.ylabel('Z-detector', fontsize=16)
+plt.title('2D Projection (X-Z) view', fontsize=19)
+plt.subplot(132)
+plt.imshow(projdata_norm[sliceSel,:,:],vmin=0, vmax=intens_max, cmap="PuOr")
+plt.xlabel('X-detector', fontsize=16)
+plt.ylabel('Projection angle', fontsize=16)
+plt.title('Sinogram (X-Y) view', fontsize=19)
+plt.subplot(133)
+plt.imshow(projdata_norm[:,:,sliceSel],vmin=0, vmax=intens_max, cmap="PuOr")
+plt.xlabel('Projection angle', fontsize=16)
+plt.ylabel('Z-detector', fontsize=16)
+plt.title('Vertical (Y-Z) view', fontsize=19)
+plt.show()
+#plt.savefig('projdata.pdf', format='pdf', dpi=1200)
+#%%
+# initialise TomoRec DIRECT reconstruction class ONCE
+from tomorec.methodsDIR import RecToolsDIR
+RectoolsDIR = RecToolsDIR(DetectorsDimH = Horiz_det,  # DetectorsDimH # detector dimension (horizontal)
+                    DetectorsDimV = Vert_det,  # DetectorsDimV # detector dimension (vertical) for 3D case only
+                    AnglesVec = proj_angles, # array of angles in radians
+                    ObjSize = N_size, # a scalar to define reconstructed object dimensions
+                    device = 'gpu')
+#%%
+print ("Reconstruction using FBP from TomoRec")
+recFBP= RectoolsDIR.FBP(projdata_norm) # FBP reconstruction
+#%%
+x0, y0 = 0, 127 # These are in _pixel_ coordinates!!
+x1, y1 = 255, 127
+
+sliceSel = int(0.5*N_size)
+max_val = 1
+plt.figure(figsize = (20,5))
+gs1 = gridspec.GridSpec(1, 3)
+gs1.update(wspace=0.1, hspace=0.05) # set the spacing between axes. 
+ax1 = plt.subplot(gs1[0])
+plt.imshow(recFBP[sliceSel,:,:],vmin=0, vmax=max_val, cmap="PuOr")
+ax1.plot([x0, x1], [y0, y1], 'ko-', linestyle='--')
+plt.colorbar(ax=ax1)
+plt.title('FBP Reconstruction, axial (X-Y) view', fontsize=19)
+ax1.set_aspect('equal')
+ax3 = plt.subplot(gs1[1])
+plt.plot(phantom[sliceSel,sliceSel,0:N_size],color='k',linewidth=2)
+plt.plot(recFBP[sliceSel,sliceSel,0:N_size],linestyle='--',color='g')
+plt.title('Profile', fontsize=19)
+ax2 = plt.subplot(gs1[2])
+plt.imshow(recFBP[:,sliceSel,:],vmin=0, vmax=max_val, cmap="PuOr")
+plt.title('FBP Reconstruction, coronal (Y-Z) view', fontsize=19)
+ax2.set_aspect('equal')
+plt.show()
+#plt.savefig('FBP_phantom.pdf', format='pdf', dpi=1600)
+
+# calculate errors 
+Qtools = QualityTools(phantom, recFBP)
+RMSE_fbp = Qtools.rmse()
+print("Root Mean Square Error for FBP is {}".format(RMSE_fbp))
+
+# SSIM measure
+Qtools = QualityTools(phantom[128,:,:]*255, recFBP[128,:,:]*235)
+win = np.array([gaussian(11, 1.5)])
+win2d = win * (win.T)
+ssim_fbp = Qtools.ssim(win2d)
+print("Mean SSIM for FBP is {}".format(ssim_fbp[0]))
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("Reconstructing with ADMM method using TomoRec software")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+# initialise TomoRec ITERATIVE reconstruction class ONCE
+from tomorec.methodsIR import RecToolsIR
+RectoolsIR = RecToolsIR(DetectorsDimH = Horiz_det,  # DetectorsDimH # detector dimension (horizontal)
+                    DetectorsDimV = Vert_det,  # DetectorsDimV # detector dimension (vertical) for 3D case only
+                    AnglesVec = proj_angles, # array of angles in radians
+                    ObjSize = N_size, # a scalar to define reconstructed object dimensions
+                    datafidelity='LS',# data fidelity, choose LS, PWLS (wip), GH (wip), Student (wip)
+                    nonnegativity='ENABLE', # enable nonnegativity constraint (set to 'ENABLE')
+                    OS_number = None, # the number of subsets, NONE/(or > 1) ~ classical / ordered subsets
+                    tolerance = 1e-08, # tolerance to stop outer iterations earlier
+                    device='gpu')
+#%%
+print ("Reconstructing with ADMM method using SB-TV penalty")
+RecADMM_reg_sbtv = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 25, \
+                                  regularisation = 'SB_TV', \
+                                  regularisation_parameter = optimReg_sbtv,\
+                                  regularisation_iterations = 50)
+
+sliceSel = int(0.5*N_size)
+max_val = 1
+plt.figure(figsize = (20,3))
+gs1 = gridspec.GridSpec(1, 4)
+gs1.update(wspace=0.02, hspace=0.01) # set the spacing between axes. 
+ax1 = plt.subplot(gs1[0])
+plt.plot(reg_param_sb_vec, erros_vec_sbtv, color='k',linewidth=2)
+plt.xlabel('Regularisation parameter', fontsize=16)
+plt.ylabel('RMSE value', fontsize=16)
+plt.title('Regularisation selection', fontsize=19)
+ax2 = plt.subplot(gs1[1])
+plt.imshow(RecADMM_reg_sbtv[sliceSel,:,:],vmin=0, vmax=max_val, cmap="PuOr")
+ax2.plot([x0, x1], [y0, y1], 'ko-', linestyle='--')
+plt.title('ADMM-SBTV (X-Y) view', fontsize=19)
+#ax2.set_aspect('equal')
+ax3 = plt.subplot(gs1[2])
+plt.plot(phantom[sliceSel,sliceSel,0:N_size],color='k',linewidth=2)
+plt.plot(RecADMM_reg_sbtv[sliceSel,sliceSel,0:N_size],linestyle='--',color='g')
+plt.title('Profile', fontsize=19)
+ax4 = plt.subplot(gs1[3])
+plt.imshow(RecADMM_reg_sbtv[:,sliceSel,:],vmin=0, vmax=max_val, cmap="PuOr")
+plt.title('ADMM-SBTV (Y-Z) view', fontsize=19)
+plt.colorbar(ax=ax4)
+plt.show()
+plt.savefig('SBTV_phantom.pdf', format='pdf', dpi=1600)
+
+# calculate errors 
+Qtools = QualityTools(phantom, RecADMM_reg_sbtv)
+RMSE_admm_sbtv = Qtools.rmse()
+print("Root Mean Square Error for ADMM-SB-TV is {}".format(RMSE_admm_sbtv))
+
+# SSIM measure
+Qtools = QualityTools(phantom[128,:,:]*255, RecADMM_reg_sbtv[128,:,:]*235)
+win = np.array([gaussian(11, 1.5)])
+win2d = win * (win.T)
+ssim_admm_sbtv = Qtools.ssim(win2d)
+print("Mean SSIM ADMM-SBTV is {}".format(ssim_admm_sbtv[0]))
+#%%
+print ("Reconstructing with ADMM method using ROFLLT penalty")
+RecADMM_reg_rofllt = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 25, \
+                                  regularisation = 'LLT_ROF', \
+                                  regularisation_parameter = optimReg_rofllt,\
+                                  regularisation_parameter2 = 0.0085,\
+                                  regularisation_iterations = 600)
+
+sliceSel = int(0.5*N_size)
+max_val = 1
+plt.figure(figsize = (20,3))
+gs1 = gridspec.GridSpec(1, 4)
+gs1.update(wspace=0.02, hspace=0.01) # set the spacing between axes. 
+ax1 = plt.subplot(gs1[0])
+plt.plot(reg_param_rofllt_vec, erros_vec_rofllt, color='k',linewidth=2)
+plt.xlabel('Regularisation parameter', fontsize=16)
+plt.ylabel('RMSE value', fontsize=16)
+plt.title('Regularisation selection', fontsize=19)
+ax2 = plt.subplot(gs1[1])
+plt.imshow(RecADMM_reg_rofllt[sliceSel,:,:],vmin=0, vmax=max_val, cmap="PuOr")
+ax2.plot([x0, x1], [y0, y1], 'ko-', linestyle='--')
+plt.title('ADMM-ROFLLT (X-Y) view', fontsize=19)
+#ax2.set_aspect('equal')
+ax3 = plt.subplot(gs1[2])
+plt.plot(phantom[sliceSel,sliceSel,0:N_size],color='k',linewidth=2)
+plt.plot(RecADMM_reg_rofllt[sliceSel,sliceSel,0:N_size],linestyle='--',color='g')
+plt.title('Profile', fontsize=19)
+ax4 = plt.subplot(gs1[3])
+plt.imshow(RecADMM_reg_rofllt[:,sliceSel,:],vmin=0, vmax=max_val, cmap="PuOr")
+plt.title('ADMM-ROFLLT (Y-Z) view', fontsize=19)
+plt.colorbar(ax=ax4)
+plt.show()
+#plt.savefig('ROFLLT_phantom.pdf', format='pdf', dpi=1600)
+
+# calculate errors 
+Qtools = QualityTools(phantom, RecADMM_reg_rofllt)
+RMSE_admm_rofllt = Qtools.rmse()
+print("Root Mean Square Error for ADMM-ROF-LLT is {}".format(RMSE_admm_rofllt))
+
+# SSIM measure
+Qtools = QualityTools(phantom[128,:,:]*255, RecADMM_reg_rofllt[128,:,:]*235)
+win = np.array([gaussian(11, 1.5)])
+win2d = win * (win.T)
+ssim_admm_rifllt = Qtools.ssim(win2d)
+print("Mean SSIM ADMM-ROFLLT is {}".format(ssim_admm_rifllt[0]))
+#%%
+print ("Reconstructing with ADMM method using TGV penalty")
+RecADMM_reg_tgv = RectoolsIR.ADMM(projdata_norm,
+                                  rho_const = 2000.0, \
+                                  iterationsADMM = 25, \
+                                  regularisation = 'TGV', \
+                                  regularisation_parameter = optimReg_tgv,\
+                                  regularisation_iterations = 600)
+#%%
+sliceSel = int(0.5*N_size)
+max_val = 1
+plt.figure(figsize = (20,3))
+gs1 = gridspec.GridSpec(1, 4)
+gs1.update(wspace=0.02, hspace=0.01) # set the spacing between axes. 
+ax1 = plt.subplot(gs1[0])
+plt.plot(reg_param_tgv_vec, erros_vec_tgv, color='k',linewidth=2)
+plt.xlabel('Regularisation parameter', fontsize=16)
+plt.ylabel('RMSE value', fontsize=16)
+plt.title('Regularisation selection', fontsize=19)
+ax2 = plt.subplot(gs1[1])
+plt.imshow(RecADMM_reg_tgv[sliceSel,:,:],vmin=0, vmax=max_val, cmap="PuOr")
+ax2.plot([x0, x1], [y0, y1], 'ko-', linestyle='--')
+plt.title('ADMM-TGV (X-Y) view', fontsize=19)
+#ax2.set_aspect('equal')
+ax3 = plt.subplot(gs1[2])
+plt.plot(phantom[sliceSel,sliceSel,0:N_size],color='k',linewidth=2)
+plt.plot(RecADMM_reg_tgv[sliceSel,sliceSel,0:N_size],linestyle='--',color='g')
+plt.title('Profile', fontsize=19)
+ax4 = plt.subplot(gs1[3])
+plt.imshow(RecADMM_reg_tgv[:,sliceSel,:],vmin=0, vmax=max_val, cmap="PuOr")
+plt.title('ADMM-TGV (Y-Z) view', fontsize=19)
+plt.colorbar(ax=ax4)
+plt.show()
+#plt.savefig('TGV_phantom.pdf', format='pdf', dpi=1600)
+
+# calculate errors 
+Qtools = QualityTools(phantom, RecADMM_reg_tgv)
+RMSE_admm_tgv = Qtools.rmse()
+print("Root Mean Square Error for ADMM-TGV is {}".format(RMSE_admm_tgv))
+
+# SSIM measure
+#Create a 2d gaussian for the window parameter
+Qtools = QualityTools(phantom[128,:,:]*255, RecADMM_reg_tgv[128,:,:]*235)
+win = np.array([gaussian(11, 1.5)])
+win2d = win * (win.T)
+ssim_admm_tgv = Qtools.ssim(win2d)
+print("Mean SSIM ADMM-TGV is {}".format(ssim_admm_tgv[0]))
+#%%
+\ No newline at end of file
diff --git a/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_SX.py b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_SX.py
new file mode 100644
index 0000000..cdf4325
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/Demo_SimulData_SX.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This demo scripts support the following publication: 
+"CCPi-Regularisation Toolkit for computed tomographic image reconstruction with 
+proximal splitting algorithms" by Daniil Kazantsev, Edoardo Pasca, Martin J. Turner,
+ Philip J. Withers; Software X, 2019
+____________________________________________________________________________
+* Runs TomoPhantom software to simulate tomographic projection data with
+some imaging errors and noise
+* Saves the data into hdf file to be uploaded in reconstruction scripts
+__________________________________________________________________________
+
+>>>>> Dependencies: <<<<<
+1. TomoPhantom software for phantom and data generation
+
+@author: Daniil Kazantsev, e:mail daniil.kazantsev@diamond.ac.uk
+Apache 2.0 license
+"""
+import timeit
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+import tomophantom
+from tomophantom import TomoP3D
+from tomophantom.supp.flatsgen import flats
+from tomophantom.supp.normraw import normaliser_sim
+
+print ("Building 3D phantom using TomoPhantom software")
+tic=timeit.default_timer()
+model = 16 # select a model number from the library
+N_size = 256 # Define phantom dimensions using a scalar value (cubic phantom)
+path = os.path.dirname(tomophantom.__file__)
+path_library3D = os.path.join(path, "Phantom3DLibrary.dat")
+#This will generate a N_size x N_size x N_size phantom (3D)
+phantom_tm = TomoP3D.Model(model, N_size, path_library3D)
+toc=timeit.default_timer()
+Run_time = toc - tic
+print("Phantom has been built in {} seconds".format(Run_time))
+
+sliceSel = int(0.5*N_size)
+#plt.gray()
+plt.figure() 
+plt.subplot(131)
+plt.imshow(phantom_tm[sliceSel,:,:],vmin=0, vmax=1)
+plt.title('3D Phantom, axial view')
+
+plt.subplot(132)
+plt.imshow(phantom_tm[:,sliceSel,:],vmin=0, vmax=1)
+plt.title('3D Phantom, coronal view')
+
+plt.subplot(133)
+plt.imshow(phantom_tm[:,:,sliceSel],vmin=0, vmax=1)
+plt.title('3D Phantom, sagittal view')
+plt.show()
+
+# Projection geometry related parameters:
+Horiz_det = int(np.sqrt(2)*N_size) # detector column count (horizontal)
+Vert_det = N_size # detector row count (vertical) (no reason for it to be > N)
+angles_num = int(0.35*np.pi*N_size); # angles number
+angles = np.linspace(0.0,179.9,angles_num,dtype='float32') # in degrees
+angles_rad = angles*(np.pi/180.0)
+#%%
+print ("Building 3D analytical projection data with TomoPhantom")
+projData3D_analyt= TomoP3D.ModelSino(model, N_size, Horiz_det, Vert_det, angles, path_library3D)
+
+intens_max = N_size
+sliceSel = int(0.5*N_size)
+plt.figure() 
+plt.subplot(131)
+plt.imshow(projData3D_analyt[:,sliceSel,:],vmin=0, vmax=intens_max)
+plt.title('2D Projection (analytical)')
+plt.subplot(132)
+plt.imshow(projData3D_analyt[sliceSel,:,:],vmin=0, vmax=intens_max)
+plt.title('Sinogram view')
+plt.subplot(133)
+plt.imshow(projData3D_analyt[:,:,sliceSel],vmin=0, vmax=intens_max)
+plt.title('Tangentogram view')
+plt.show()
+#%%
+print ("Simulate flat fields, add noise and normalise projections...")
+flatsnum = 20 # generate 20 flat fields
+flatsSIM = flats(Vert_det, Horiz_det, maxheight = 0.1, maxthickness = 3, sigma_noise = 0.2, sigmasmooth = 3, flatsnum=flatsnum)
+
+plt.figure() 
+plt.imshow(flatsSIM[0,:,:],vmin=0, vmax=1)
+plt.title('A selected simulated flat-field')
+#%%
+# Apply normalisation of data and add noise
+flux_intensity = 60000 # controls the level of noise 
+sigma_flats = 0.01 # contro the level of noise in flats (higher creates more ring artifacts)
+projData3D_norm = normaliser_sim(projData3D_analyt, flatsSIM, sigma_flats, flux_intensity)
+
+intens_max = N_size
+sliceSel = int(0.5*N_size)
+plt.figure() 
+plt.subplot(131)
+plt.imshow(projData3D_norm[:,sliceSel,:],vmin=0, vmax=intens_max)
+plt.title('2D Projection (erroneous)')
+plt.subplot(132)
+plt.imshow(projData3D_norm[sliceSel,:,:],vmin=0, vmax=intens_max)
+plt.title('Sinogram view')
+plt.subplot(133)
+plt.imshow(projData3D_norm[:,:,sliceSel],vmin=0, vmax=intens_max)
+plt.title('Tangentogram view')
+plt.show()
+#%%
+import h5py
+import time
+time_label = int(time.time())
+# Saving generated data with a unique time label
+h5f = h5py.File('TomoSim_data'+str(time_label)+'.h5', 'w')
+h5f.create_dataset('phantom', data=phantom_tm)
+h5f.create_dataset('projdata_norm', data=projData3D_norm)
+h5f.create_dataset('proj_angles', data=angles_rad)
+h5f.close()
+#%%
+\ No newline at end of file
diff --git a/Wrappers/Python/demos/SoftwareX_supp/Readme.md b/Wrappers/Python/demos/SoftwareX_supp/Readme.md
new file mode 100644
index 0000000..54e83f1
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/Readme.md
@@ -0,0 +1,26 @@
+
+# SoftwareX publication [1] supporting files
+
+## Decription:
+The scripts here support publication in SoftwareX journal [1] to ensure reproducibility of the research. The scripts linked with data shared at Zenodo. 
+
+## Data:
+Data is shared at Zenodo [here](https://doi.org/10.5281/zenodo.2578893)
+
+## Dependencies:
+1. [ASTRA toolbox](https://github.com/astra-toolbox/astra-toolbox): `conda install -c astra-toolbox astra-toolbox`
+2. [TomoRec](https://github.com/dkazanc/TomoRec): `conda install -c dkazanc tomorec`
+3. [Tomophantom](https://github.com/dkazanc/TomoPhantom): `conda install tomophantom -c ccpi`
+
+## Files description: 
+- `Demo_SimulData_SX.py` - simulates 3D projection data using [Tomophantom](https://github.com/dkazanc/TomoPhantom) software. One can skip this module if the data is taken from [Zenodo](https://doi.org/10.5281/zenodo.2578893)
+- `Demo_SimulData_ParOptimis_SX.py` - runs computationally extensive calculations for optimal regularisation parameters, the result are saved into directory `optim_param`. This script can be also skipped. 
+- `Demo_SimulData_Recon_SX.py` - using established regularisation parameters, one runs iterative reconstruction
+- `Demo_RealData_Recon_SX.py` - runs real data reconstructions. Can be quite intense on memory so reduce the size of the reconstructed volume if needed. 
+
+### References:
+[1] "CCPi-Regularisation Toolkit for computed tomographic image reconstruction with proximal splitting algorithms" by Daniil Kazantsev, Edoardo Pasca, Martin J. Turner and Philip J. Withers; SoftwareX, 2019. 
+
+### Acknowledgments:
+CCPi-RGL software is a product of the [CCPi](https://www.ccpi.ac.uk/) group, STFC SCD software developers and Diamond Light Source (DLS). Any relevant questions/comments can be e-mailed to Daniil Kazantsev at dkazanc@hotmail.com
+
diff --git a/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_rofllt.h5 b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_rofllt.h5
new file mode 100644
index 0000000..63bc4fd
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_rofllt.h5
diff --git a/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_sbtv.h5 b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_sbtv.h5
new file mode 100644
index 0000000..03c0c14
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_sbtv.h5
diff --git a/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_tgv.h5 b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_tgv.h5
new file mode 100644
index 0000000..056d915
--- /dev/null
+++ b/Wrappers/Python/demos/SoftwareX_supp/optim_param/Optim_admm_tgv.h5
diff --git a/demos/demoMatlab_denoise.m b/demos/demoMatlab_denoise.m
index 5135129..5e92ee1 100644
--- a/demos/demoMatlab_denoise.m
+++ b/demos/demoMatlab_denoise.m
@@ -13,87 +13,119 @@ Im = double(imread('lena_gray_512.tif'))/255;  % loading image
 u0 = Im + .05*randn(size(Im)); u0(u0 < 0) = 0;
 figure; imshow(u0, [0 1]); title('Noisy image');
 
-lambda_reg = 0.03; % regularsation parameter for all methods
 %%
 fprintf('Denoise using the ROF-TV model (CPU) \n');
+lambda_reg = 0.017; % regularsation parameter for all methods
 tau_rof = 0.0025; % time-marching constant 
-iter_rof = 750; % number of ROF iterations
+iter_rof = 1200; % number of ROF iterations
 tic; u_rof = ROF_TV(single(u0), lambda_reg, iter_rof, tau_rof); toc; 
 energyfunc_val_rof = TV_energy(single(u_rof),single(u0),lambda_reg, 1);  % get energy function value
 rmseROF = (RMSE(u_rof(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for ROF-TV is:', rmseROF);
+[ssimval] = ssim(u_rof*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for ROF-TV is:', ssimval);
 figure; imshow(u_rof, [0 1]); title('ROF-TV denoised image (CPU)');
 %%
 % fprintf('Denoise using the ROF-TV model (GPU) \n');
 % tau_rof = 0.0025; % time-marching constant 
-% iter_rof = 750; % number of ROF iterations
+% iter_rof = 1200; % number of ROF iterations
 % tic; u_rofG = ROF_TV_GPU(single(u0), lambda_reg, iter_rof, tau_rof); toc;
 % figure; imshow(u_rofG, [0 1]); title('ROF-TV denoised image (GPU)');
 %%
 fprintf('Denoise using the FGP-TV model (CPU) \n');
-iter_fgp = 1300; % number of FGP iterations
-epsil_tol =  1.0e-06; % tolerance
+lambda_reg = 0.033;
+iter_fgp = 300; % number of FGP iterations
+epsil_tol =  1.0e-09; % tolerance
 tic; u_fgp = FGP_TV(single(u0), lambda_reg, iter_fgp, epsil_tol); toc; 
 energyfunc_val_fgp = TV_energy(single(u_fgp),single(u0),lambda_reg, 1); % get energy function value
 rmseFGP = (RMSE(u_fgp(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for FGP-TV is:', rmseFGP);
+[ssimval] = ssim(u_fgp*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for FGP-TV is:', ssimval);
 figure; imshow(u_fgp, [0 1]); title('FGP-TV denoised image (CPU)');
-
 %%
 % fprintf('Denoise using the FGP-TV model (GPU) \n');
-% iter_fgp = 1300; % number of FGP iterations
-% epsil_tol =  1.0e-06; % tolerance
+% iter_fgp = 300; % number of FGP iterations
+% epsil_tol =  1.0e-09; % tolerance
 % tic; u_fgpG = FGP_TV_GPU(single(u0), lambda_reg, iter_fgp, epsil_tol); toc; 
 % figure; imshow(u_fgpG, [0 1]); title('FGP-TV denoised image (GPU)');
 %%
 fprintf('Denoise using the SB-TV model (CPU) \n');
-iter_sb = 150; % number of SB iterations
-epsil_tol =  1.0e-06; % tolerance
+iter_sb = 80; % number of SB iterations
+epsil_tol =  1.0e-08; % tolerance
 tic; u_sb = SB_TV(single(u0), lambda_reg, iter_sb, epsil_tol); toc; 
 energyfunc_val_sb = TV_energy(single(u_sb),single(u0),lambda_reg, 1);  % get energy function value
 rmseSB = (RMSE(u_sb(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for SB-TV is:', rmseSB);
+[ssimval] = ssim(u_sb*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for SB-TV is:', ssimval);
 figure; imshow(u_sb, [0 1]); title('SB-TV denoised image (CPU)');
 %%
 % fprintf('Denoise using the SB-TV model (GPU) \n');
-% iter_sb = 150; % number of SB iterations
+% iter_sb = 80; % number of SB iterations
 % epsil_tol =  1.0e-06; % tolerance
 % tic; u_sbG = SB_TV_GPU(single(u0), lambda_reg, iter_sb, epsil_tol); toc; 
 % figure; imshow(u_sbG, [0 1]); title('SB-TV denoised image (GPU)');
 %%
+fprintf('Denoise using Nonlinear-Diffusion model (CPU) \n');
+iter_diff = 450; % number of diffusion iterations
+lambda_regDiff = 0.025; % regularisation for the diffusivity 
+sigmaPar = 0.015; % edge-preserving parameter
+tau_param = 0.02; % time-marching constant 
+tic; u_diff = NonlDiff(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+rmseDiffus = (RMSE(u_diff(:),Im(:)));
+fprintf('%s %f \n', 'RMSE error for Nonlinear Diffusion is:', rmseDiffus);
+[ssimval] = ssim(u_diff*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for NDF is:', ssimval);
+figure; imshow(u_diff, [0 1]); title('Diffusion denoised image (CPU)');
+%%
+% fprintf('Denoise using Nonlinear-Diffusion model (GPU) \n');
+% iter_diff = 450; % number of diffusion iterations
+% lambda_regDiff = 0.025; % regularisation for the diffusivity 
+% sigmaPar = 0.015; % edge-preserving parameter
+% tau_param = 0.025; % time-marching constant 
+% tic; u_diff_g = NonlDiff_GPU(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
+% figure; imshow(u_diff_g, [0 1]); title('Diffusion denoised image (GPU)');
+%%
 fprintf('Denoise using the TGV model (CPU) \n');
-lambda_TGV = 0.045; % regularisation parameter
+lambda_TGV = 0.034; % regularisation parameter
 alpha1 = 1.0; % parameter to control the first-order term
-alpha0 = 2.0; % parameter to control the second-order term
-iter_TGV = 1500; % number of Primal-Dual iterations for TGV
+alpha0 = 1.0; % parameter to control the second-order term
+iter_TGV = 500; % number of Primal-Dual iterations for TGV
 tic; u_tgv = TGV(single(u0), lambda_TGV, alpha1, alpha0, iter_TGV); toc; 
 rmseTGV = (RMSE(u_tgv(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV);
+[ssimval] = ssim(u_tgv*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for TGV is:', ssimval);
 figure; imshow(u_tgv, [0 1]); title('TGV denoised image (CPU)');
-
+%%
 % fprintf('Denoise using the TGV model (GPU) \n');
-% lambda_TGV = 0.045; % regularisation parameter
+% lambda_TGV = 0.034; % regularisation parameter
 % alpha1 = 1.0; % parameter to control the first-order term
-% alpha0 = 2.0; % parameter to control the second-order term
-% iter_TGV = 1500; % number of Primal-Dual iterations for TGV
+% alpha0 = 1.0; % parameter to control the second-order term
+% iter_TGV = 500; % number of Primal-Dual iterations for TGV
 % tic; u_tgv_gpu = TGV_GPU(single(u0), lambda_TGV, alpha1, alpha0, iter_TGV); toc; 
 % rmseTGV_gpu = (RMSE(u_tgv_gpu(:),Im(:)));
 % fprintf('%s %f \n', 'RMSE error for TGV is:', rmseTGV_gpu);
+% [ssimval] = ssim(u_tgv_gpu*255,single(Im)*255);
+% fprintf('%s %f \n', 'MSSIM error for TGV is:', ssimval);
 % figure; imshow(u_tgv_gpu, [0 1]); title('TGV denoised image (GPU)');
 %%
 fprintf('Denoise using the ROF-LLT model (CPU) \n');
-lambda_ROF = lambda_reg; % ROF regularisation parameter
-lambda_LLT = lambda_reg*0.45; % LLT regularisation parameter
-iter_LLT = 1; % iterations 
+lambda_ROF = 0.016; % ROF regularisation parameter
+lambda_LLT = lambda_reg*0.25; % LLT regularisation parameter
+iter_LLT = 500; % iterations 
 tau_rof_llt = 0.0025; % time-marching constant 
 tic; u_rof_llt = LLT_ROF(single(u0), lambda_ROF, lambda_LLT, iter_LLT, tau_rof_llt); toc; 
 rmseROFLLT = (RMSE(u_rof_llt(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for TGV is:', rmseROFLLT);
+[ssimval] = ssim(u_rof_llt*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for ROFLLT is:', ssimval);
 figure; imshow(u_rof_llt, [0 1]); title('ROF-LLT denoised image (CPU)');
 %%
 % fprintf('Denoise using the ROF-LLT model (GPU) \n');
-% lambda_ROF = lambda_reg; % ROF regularisation parameter
-% lambda_LLT = lambda_reg*0.45; % LLT regularisation parameter
+% lambda_ROF = 0.016; % ROF regularisation parameter
+% lambda_LLT = lambda_reg*0.25; % LLT regularisation parameter
 % iter_LLT = 500; % iterations 
 % tau_rof_llt = 0.0025; % time-marching constant 
 % tic; u_rof_llt_g = LLT_ROF_GPU(single(u0), lambda_ROF, lambda_LLT, iter_LLT, tau_rof_llt); toc; 
@@ -101,32 +133,16 @@ figure; imshow(u_rof_llt, [0 1]); title('ROF-LLT denoised image (CPU)');
 % fprintf('%s %f \n', 'RMSE error for TGV is:', rmseROFLLT_g);
 % figure; imshow(u_rof_llt_g, [0 1]); title('ROF-LLT denoised image (GPU)');
 %%
-fprintf('Denoise using Nonlinear-Diffusion model (CPU) \n');
-iter_diff = 800; % number of diffusion iterations
-lambda_regDiff = 0.025; % regularisation for the diffusivity 
-sigmaPar = 0.015; % edge-preserving parameter
-tau_param = 0.025; % time-marching constant 
-tic; u_diff = NonlDiff(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
-rmseDiffus = (RMSE(u_diff(:),Im(:)));
-fprintf('%s %f \n', 'RMSE error for Nonlinear Diffusion is:', rmseDiffus);
-figure; imshow(u_diff, [0 1]); title('Diffusion denoised image (CPU)');
-%%
-% fprintf('Denoise using Nonlinear-Diffusion model (GPU) \n');
-% iter_diff = 800; % number of diffusion iterations
-% lambda_regDiff = 0.025; % regularisation for the diffusivity 
-% sigmaPar = 0.015; % edge-preserving parameter
-% tau_param = 0.025; % time-marching constant 
-% tic; u_diff_g = NonlDiff_GPU(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param, 'Huber'); toc; 
-% figure; imshow(u_diff_g, [0 1]); title('Diffusion denoised image (GPU)');
-%%
 fprintf('Denoise using Fourth-order anisotropic diffusion model (CPU) \n');
 iter_diff = 800; % number of diffusion iterations
 lambda_regDiff = 3.5; % regularisation for the diffusivity 
-sigmaPar = 0.02; % edge-preserving parameter
+sigmaPar = 0.021; % edge-preserving parameter
 tau_param = 0.0015; % time-marching constant 
 tic; u_diff4 = Diffusion_4thO(single(u0), lambda_regDiff, sigmaPar, iter_diff, tau_param); toc; 
 rmseDiffHO = (RMSE(u_diff4(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for Fourth-order anisotropic diffusion is:', rmseDiffHO);
+[ssimval] = ssim(u_diff4*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for DIFF4th is:', ssimval);
 figure; imshow(u_diff4, [0 1]); title('Diffusion 4thO denoised image (CPU)');
 %%
 % fprintf('Denoise using Fourth-order anisotropic diffusion model (GPU) \n');
@@ -146,10 +162,12 @@ tic; [H_i, H_j, Weights] = PatchSelect(single(u0), SearchingWindow, PatchWindow,
 %%
 fprintf('Denoise using Non-local Total Variation (CPU) \n');
 iter_nltv = 3; % number of nltv iterations
-lambda_nltv = 0.05; % regularisation parameter for nltv
+lambda_nltv = 0.055; % regularisation parameter for nltv
 tic; u_nltv = Nonlocal_TV(single(u0), H_i, H_j, 0, Weights, lambda_nltv, iter_nltv); toc; 
 rmse_nltv = (RMSE(u_nltv(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for Non-local Total Variation is:', rmse_nltv);
+[ssimval] = ssim(u_nltv*255,single(Im)*255);
+fprintf('%s %f \n', 'MSSIM error for NLTV is:', ssimval);
 figure; imagesc(u_nltv, [0 1]); colormap(gray); daspect([1 1 1]); title('Non-local Total Variation denoised image (CPU)');
 %%
 %>>>>>>>>>>>>>> MULTI-CHANNEL priors <<<<<<<<<<<<<<< %
diff --git a/src/Core/regularisers_GPU/TGV_GPU_core.cu b/src/Core/regularisers_GPU/TGV_GPU_core.cu
index e4abf72..849219b 100644
--- a/src/Core/regularisers_GPU/TGV_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TGV_GPU_core.cu
@@ -38,49 +38,49 @@ limitations under the License.
  * [1] K. Bredies "Total Generalized Variation"
  */
     
+   
+#define BLKXSIZE2D 16
+#define BLKYSIZE2D 16
+
 #define BLKXSIZE 8
 #define BLKYSIZE 8
-#define BLKZSIZE 8    
-    
-#define BLKXSIZE2D 8
-#define BLKYSIZE2D 8
-#define EPS 1.0e-7
-#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+#define BLKZSIZE 8
 
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 
 /********************************************************************/
 /***************************2D Functions*****************************/
 /********************************************************************/
-__global__ void DualP_2D_kernel(float *U, float *V1, float *V2, float *P1, float *P2, int dimX, int dimY, float sigma)
+__global__ void DualP_2D_kernel(float *U, float *V1, float *V2, float *P1, float *P2, long dimX, long dimY, float sigma)
 {    
-	int num_total = dimX*dimY;
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;       
+        long index = i + (dimX)*j;       
         
-        if (index < num_total) {  
+        if ((i < dimX) && (j < dimY)) {
         /* symmetric boundary conditions (Neuman) */            
         if ((i >= 0) && (i < dimX-1))  P1[index] += sigma*((U[(i+1) + dimX*j] - U[index])  - V1[index]); 
-        else P1[index] += sigma*(-V1[index]); 
+        else if  (i == dimX-1) P1[index] -= sigma*(V1[index]); 
+        else P1[index] = 0.0f; 
         if ((j >= 0) && (j < dimY-1))  P2[index] += sigma*((U[i + dimX*(j+1)] - U[index])  - V2[index]);
-        else P2[index] += sigma*(-V2[index]);                    
+        else if  (j == dimY-1) P2[index] -= sigma*(V2[index]);                    
+        else P2[index] = 0.0f; 
 	}
 	return;
 } 
 
-__global__ void ProjP_2D_kernel(float *P1, float *P2, int dimX, int dimY, float alpha1)
+__global__ void ProjP_2D_kernel(float *P1, float *P2, long dimX, long dimY, float alpha1)
 {
    	float grad_magn;
-	int num_total = dimX*dimY;
 	
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;
+        long index = i + (dimX)*j;    
                 
-        if (index < num_total) {            
-            grad_magn = sqrtf(pow(P1[index],2) + pow(P2[index],2));
+        if ((i < dimX) && (j < dimY)) {            
+            grad_magn = sqrtf(powf(P1[index],2) + powf(P2[index],2));
             grad_magn = grad_magn/alpha1;
             if (grad_magn > 1.0f) {
                 P1[index] /= grad_magn;
@@ -90,17 +90,15 @@ __global__ void ProjP_2D_kernel(float *P1, float *P2, int dimX, int dimY, float
 	return;
 } 
 
-__global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, float *Q3, int dimX, int dimY, float sigma)
+__global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float sigma)
 {
         float q1, q2, q11, q22;
-	int num_total = dimX*dimY;
-	
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;      
+        long index = i + (dimX)*j;          
         
-        if (index < num_total) {
+        if ((i < dimX) && (j < dimY)) {   
          q1 = 0.0f; q2  = 0.0f; q11  = 0.0f; q22  = 0.0f;
          
 	        if ((i >= 0) && (i < dimX-1))  {            
@@ -120,18 +118,16 @@ __global__ void DualQ_2D_kernel(float *V1, float *V2, float *Q1, float *Q2, floa
 	return;
 } 
 
-__global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, int dimX, int dimY, float alpha0)
+__global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, long dimX, long dimY, float alpha0)
 {
 	float grad_magn;
-        int num_total = dimX*dimY;
-
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;        
+        long index = i + (dimX)*j;     
        
-        if (index < num_total) {
-            grad_magn = sqrt(pow(Q1[index],2) + pow(Q2[index],2) + 2*pow(Q3[index],2));
+        if ((i < dimX) && (j < dimY)) {  
+            grad_magn = sqrt(powf(Q1[index],2) + powf(Q2[index],2) + 2*powf(Q3[index],2));
             grad_magn = grad_magn/alpha0;
             if (grad_magn > 1.0f) {
                 Q1[index] /= grad_magn;
@@ -142,26 +138,26 @@ __global__ void ProjQ_2D_kernel(float *Q1, float *Q2, float *Q3, int dimX, int d
 	return;
 } 
 
-__global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, int dimX, int dimY, float lambda, float tau)
+__global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, long dimX, long dimY, float lambda, float tau)
 {
 	float P_v1, P_v2, div;
-	int num_total = dimX*dimY;
-	
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;       
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;
+        long index = i + (dimX)*j;    
 
-        if (index < num_total) { 
-	P_v1 = 0.0f; P_v2 = 0.0f;
-        
-        if (i == 0) P_v1 = P1[index];
-        if (i == dimX-1) P_v1 = -P1[(i-1) + dimX*j];
+        if ((i < dimX) && (j < dimY)) {
+			        
         if ((i > 0) && (i < dimX-1)) P_v1 = P1[index] - P1[(i-1) + dimX*j];
+        else if (i == dimX-1) P_v1 = -P1[(i-1) + dimX*j];
+        else if (i == 0) P_v1 = P1[index];
+        else P_v1 = 0.0f;
 
-        if (j == 0) P_v2 = P2[index];
-        if (j == dimY-1) P_v2 = -P2[i + dimX*(j-1)];
       	if ((j > 0) && (j < dimY-1))  P_v2 = P2[index] - P2[i + dimX*(j-1)];
+      	else if (j == dimY-1) P_v2 = -P2[i + dimX*(j-1)];
+      	else if (j == 0) P_v2 = P2[index];
+      	else P_v2 = 0.0f;
+
          
         div = P_v1 + P_v2;
         U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau);
@@ -169,18 +165,19 @@ __global__ void DivProjP_2D_kernel(float *U, float *U0, float *P1, float *P2, in
 	return;
 } 
 
-__global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, int dimX, int dimY, float tau)
+__global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, long dimX, long dimY, float tau)
 {
 	float q1, q3_x, q2, q3_y, div1, div2;
-	int num_total = dimX*dimY;
-	int i1, j1;
+	long i1, j1;
 	
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
         
-        int index = i + dimX*j;      
+        long index = i + (dimX)*j;        
       
-	if (index < num_total) {   
+        if ((i < dimX) && (j < dimY)) {     
+
+	q1 = 0.0f; q3_x = 0.0f; q2 = 0.0f; q3_y = 0.0f; div1 = 0.0f; div2= 0.0f;
 	
 	    i1 = (i-1) + dimX*j;
             j1 = (i) + dimX*(j-1);
@@ -222,94 +219,95 @@ __global__ void UpdV_2D_kernel(float *V1, float *V2, float *P1, float *P2, float
 	return;
 } 
 
-__global__ void copyIm_TGV_kernel(float *U, float *U_old, int N, int M, int num_total)
+__global__ void copyIm_TGV_kernel(float *U, float *U_old, long dimX, long dimY)
 {
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        long index = i + (dimX)*j;    
     
-    if (index < num_total)   {
+    if ((i < dimX) && (j < dimY)) {
         U_old[index] = U[index];
     }
 }
 
-__global__ void copyIm_TGV_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, int N, int M, int num_total)
+__global__ void copyIm_TGV_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, long dimX, long dimY)
 {
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        long index = i + (dimX)*j;    
     
-    if (index < num_total)   {
+    if ((i < dimX) && (j < dimY)) {
         V1_old[index] = V1[index];
         V2_old[index] = V2[index];
     }
 }
 
-__global__ void newU_kernel(float *U, float *U_old, int N, int M, int num_total)
+__global__ void newU_kernel(float *U, float *U_old, long dimX, long dimY)
 {
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        long index = i + (dimX)*j;    
     
-    if (index < num_total)	{
+    if ((i < dimX) && (j < dimY)) {
         U[index] = 2.0f*U[index] - U_old[index];
     }
 }
 
 
-__global__ void newU_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, int N, int M, int num_total)
+__global__ void newU_kernel_ar2(float *V1, float *V2, float *V1_old, float *V2_old, long dimX, long dimY)
 {
-    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
-    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    int index = xIndex + N*yIndex;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        
+        long index = i + (dimX)*j;    
     
-    if (index < num_total)	{
+    if ((i < dimX) && (j < dimY)) {
         V1[index] = 2.0f*V1[index] - V1_old[index];
         V2[index] = 2.0f*V2[index] - V2_old[index];  
     }
 }
+
 /********************************************************************/
 /***************************3D Functions*****************************/
 /********************************************************************/
-__global__ void DualP_3D_kernel(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float sigma)
+__global__ void DualP_3D_kernel(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float sigma)
 {    
-	int index;
-	const int i = blockDim.x * blockIdx.x + threadIdx.x;
-        const int j = blockDim.y * blockIdx.y + threadIdx.y;
-        const int k = blockDim.z * blockIdx.z + threadIdx.z;
-
-   	int num_total = dimX*dimY*dimZ;
-        
-        index = (dimX*dimY)*k + i*dimX+j;    
-        if (index < num_total) {                
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
+               
+        index = (dimX*dimY)*k + i*dimX+j;
+            
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
             /* symmetric boundary conditions (Neuman) */            
             if ((i >= 0) && (i < dimX-1)) P1[index] += sigma*((U[(dimX*dimY)*k + (i+1)*dimX+j] - U[index])  - V1[index]);  
-	    else P1[index] += sigma*(-V1[index]); 
+	    else if (i == dimX-1) P1[index] -= sigma*(V1[index]); 
+	    else P1[index] = 0.0f;
 	    if ((j >= 0) && (j < dimY-1)) P2[index] += sigma*((U[(dimX*dimY)*k + i*dimX+(j+1)] - U[index])  - V2[index]);        
-	    else P2[index] += sigma*(-V2[index]);                
+	    else if (j == dimY-1) P2[index] -= sigma*(V2[index]);                
+	    else P2[index] = 0.0f;	    
       	    if ((k >= 0) && (k < dimZ-1)) P3[index] += sigma*((U[(dimX*dimY)*(k+1) + i*dimX+(j)] - U[index])  - V3[index]);        
-	    else P3[index] += sigma*(-V3[index]);                	    
-	 }	
+      	    else if (k == dimZ-1) P3[index] -= sigma*(V3[index]);                	    
+      	    else P3[index] = 0.0f;
+	 }	 
 	return;
-} 
+}
 
-__global__ void ProjP_3D_kernel(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float alpha1)
+__global__ void ProjP_3D_kernel(float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float alpha1)
 {
    	float grad_magn;
-   	int index;
-   	int num_total = dimX*dimY*dimZ;
-   	
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
         
         index = (dimX*dimY)*k + i*dimX+j;    
-        if (index < num_total) {            
-            grad_magn = (sqrtf(pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2)))/alpha1;
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {                     
+            grad_magn = (sqrtf(powf(P1[index],2) + powf(P2[index],2) + powf(P3[index],2)))/alpha1;
             if (grad_magn > 1.0f) {
                 P1[index] /= grad_magn;
                 P2[index] /= grad_magn;
@@ -319,23 +317,22 @@ __global__ void ProjP_3D_kernel(float *P1, float *P2, float *P3, int dimX, int d
 	return;
 }
 
-__global__ void DualQ_3D_kernel(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float sigma)
+__global__ void DualQ_3D_kernel(float *V1, float *V2, float *V3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float sigma)
 {
-	int index; 
-        float q1, q2, q3, q11, q22, q33, q44, q55, q66;
 
-   	int num_total = dimX*dimY*dimZ;
-   	
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
+        float q1, q2, q3, q11, q22, q33, q44, q55, q66;
+	long index;
+	
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
         
         index = (dimX*dimY)*k + i*dimX+j; 
-        int i1 = (dimX*dimY)*k + (i+1)*dimX+j;
-        int j1 = (dimX*dimY)*k + (i)*dimX+(j+1);
-        int k1 = (dimX*dimY)*(k+1) + (i)*dimX+(j);
+        long i1 = (dimX*dimY)*k + (i+1)*dimX+j;
+        long j1 = (dimX*dimY)*k + (i)*dimX+(j+1);
+        long k1 = (dimX*dimY)*(k+1) + (i)*dimX+(j);
                 
-        if (index < num_total) {
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
  	q1 = 0.0f; q11 = 0.0f; q33 = 0.0f; q2 = 0.0f; q22 = 0.0f; q55 = 0.0f; q3 = 0.0f; q44 = 0.0f; q66 = 0.0f;
          
 	        /* boundary conditions (Neuman) */
@@ -362,20 +359,18 @@ __global__ void DualQ_3D_kernel(float *V1, float *V2, float *V3, float *Q1, floa
 	return;
 }
 
-__global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float alpha0)
+__global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float alpha0)
 {
 	float grad_magn;
-	int index;
-   	int num_total = dimX*dimY*dimZ;
-   	
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;        
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;      
 
         index = (dimX*dimY)*k + i*dimX+j; 
         
-        if (index < num_total) {	
-	grad_magn = sqrtf(pow(Q1[index],2) + pow(Q2[index],2) + pow(Q3[index],2) + 2.0f*pow(Q4[index],2) + 2.0f*pow(Q5[index],2) + 2.0f*pow(Q6[index],2));
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
+	grad_magn = sqrtf(powf(Q1[index],2) + powf(Q2[index],2) + powf(Q3[index],2) + 2.0f*powf(Q4[index],2) + 2.0f*powf(Q5[index],2) + 2.0f*powf(Q6[index],2));
             grad_magn = grad_magn/alpha0;
             if (grad_magn > 1.0f) {
                 Q1[index] /= grad_magn;
@@ -388,60 +383,56 @@ __global__ void ProjQ_3D_kernel(float *Q1, float *Q2, float *Q3, float *Q4, floa
 	}
 	return;
 } 
-__global__ void DivProjP_3D_kernel(float *U, float *U0, float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float lambda, float tau)
+__global__ void DivProjP_3D_kernel(float *U, float *U0, float *P1, float *P2, float *P3, long dimX, long dimY, long dimZ, float lambda, float tau)
 {
 	float P_v1, P_v2, P_v3, div;
-	int index;
-   	int num_total = dimX*dimY*dimZ;
-   	
-   	
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;       
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;      
      
         index = (dimX*dimY)*k + i*dimX+j; 
-        int i1 = (dimX*dimY)*k + (i-1)*dimX+j;
-        int j1 = (dimX*dimY)*k + (i)*dimX+(j-1);
-        int k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j);        
-        
-        if (index < num_total) { 
-	P_v1 = 0.0f; P_v2 = 0.0f; P_v3 = 0.0f;
+        long i1 = (dimX*dimY)*k + (i-1)*dimX+j;
+        long j1 = (dimX*dimY)*k + (i)*dimX+(j-1);
+        long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j);        
         
-        if (i == 0) P_v1 = P1[index];
-        if (i == dimX-1) P_v1 = -P1[i1];
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
+
         if ((i > 0) && (i < dimX-1)) P_v1 = P1[index] - P1[i1];
+        else if (i == dimX-1) P_v1 = -P1[i1];
+        else if (i == 0) P_v1 = P1[index];
+        else P_v1 = 0.0f;
 
-        if (j == 0) P_v2 = P2[index];
-        if (j == dimY-1) P_v2 = -P2[j1];
       	if ((j > 0) && (j < dimY-1))  P_v2 = P2[index] - P2[j1];
-        
-        if (k == 0) P_v3 = P3[index];
-        if (k == dimZ-1) P_v3 = -P3[k1];
-      	if ((k > 0) && (k < dimZ-1))  P_v3 = P3[index] - P3[k1];
-     
-                      
+      	else if (j == dimY-1) P_v2 = -P2[j1];
+        else if (j == 0) P_v2 = P2[index];
+        else P_v2 = 0.0f;
+
+      	if ((k > 0) && (k < dimZ-1))  P_v3 = P3[index] - P3[k1];                
+      	else if (k == dimZ-1) P_v3 = -P3[k1];
+        else if (k == 0) P_v3 = P3[index];
+        else P_v3 = 0.0f;        
+                     
         div = P_v1 + P_v2 + P_v3;
         U[index] = (lambda*(U[index] + tau*div) + tau*U0[index])/(lambda + tau);             
 	}
 	return;
 }
-__global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, int dimX, int dimY, int dimZ, float tau)
+__global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, float *Q1, float *Q2, float *Q3, float *Q4, float *Q5, float *Q6, long dimX, long dimY, long dimZ, float tau)
 {
 	float q1, q4x, q5x, q2, q4y, q6y, q6z, q5z, q3, div1, div2, div3;
-	int index;
-	int num_total = dimX*dimY*dimZ;
-	
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
-        int j = blockDim.y * blockIdx.y + threadIdx.y;
-        int k = blockDim.z * blockIdx.z + threadIdx.z;
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
         
         index = (dimX*dimY)*k + i*dimX+j; 
-        int i1 = (dimX*dimY)*k + (i-1)*dimX+j;
-        int j1 = (dimX*dimY)*k + (i)*dimX+(j-1);
-        int k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j);    
+        long i1 = (dimX*dimY)*k + (i-1)*dimX+j;
+        long j1 = (dimX*dimY)*k + (i)*dimX+(j-1);
+        long k1 = (dimX*dimY)*(k-1) + (i)*dimX+(j);    
         
         /* Q1 - Q11, Q2 - Q22, Q3 -  Q33, Q4 - Q21/Q12, Q5 - Q31/Q13, Q6 - Q32/Q23*/       
-        if (index < num_total) {  
+        if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
 
   	 /* boundary conditions (Neuman) */        
             if ((i > 0) && (i < dimX-1)) {
@@ -507,64 +498,60 @@ __global__ void UpdV_3D_kernel(float *V1, float *V2, float *V3, float *P1, float
 	return;
 } 
 
-__global__ void copyIm_TGV_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ, int num_total)
+__global__ void copyIm_TGV_kernel3D(float *U, float *U_old, long dimX, long dimY, long dimZ)
 {
-    int index;
-	
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;    
+    long index;
+    const long i = blockDim.x * blockIdx.x + threadIdx.x;
+    const long j = blockDim.y * blockIdx.y + threadIdx.y;
+    const long k = blockDim.z * blockIdx.z + threadIdx.z; 
     
     index = (dimX*dimY)*k + j*dimX+i;
     
-    if (index < num_total) {	
+    if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
       	U_old[index] = U[index];	
     }
 }
 
-__global__ void copyIm_TGV_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, int dimX, int dimY, int dimZ, int num_total)
+__global__ void copyIm_TGV_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ)
 {
-    int index;
-	
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-    int j = blockDim.y * blockIdx.y + threadIdx.y;
-    int k = blockDim.z * blockIdx.z + threadIdx.z;    
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z; 
     
     index = (dimX*dimY)*k + j*dimX+i;
     
-    if (index < num_total) {	
+    if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           	
       	V1_old[index] = V1[index];
 	V2_old[index] = V2[index];
 	V3_old[index] = V3[index];	
     }
 }
 
-__global__ void newU_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ, int num_total)
+__global__ void newU_kernel3D(float *U, float *U_old, int dimX, int dimY, int dimZ)
 {
-     int index;
-	
-     int i = blockDim.x * blockIdx.x + threadIdx.x;
-     int j = blockDim.y * blockIdx.y + threadIdx.y;
-     int k = blockDim.z * blockIdx.z + threadIdx.z;    
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
          
      index = (dimX*dimY)*k + j*dimX+i;
     
-    if (index < num_total) {
+    if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
 	   U[index] = 2.0f*U[index] - U_old[index];
     }
 }  
 
-__global__ void newU_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, int dimX, int dimY, int dimZ, int num_total)
+__global__ void newU_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old, float *V2_old, float *V3_old, long dimX, long dimY, long dimZ)
 {
-     int index;
-	
-     int i = blockDim.x * blockIdx.x + threadIdx.x;
-     int j = blockDim.y * blockIdx.y + threadIdx.y;
-     int k = blockDim.z * blockIdx.z + threadIdx.z;    
+	long index;
+	const long i = blockDim.x * blockIdx.x + threadIdx.x;
+        const long j = blockDim.y * blockIdx.y + threadIdx.y;
+        const long k = blockDim.z * blockIdx.z + threadIdx.z;
          
      index = (dimX*dimY)*k + j*dimX+i;
     
-    if (index < num_total) {
+    if ((i < dimX) && (j < dimY)  && (k < dimZ)) {           
 	   V1[index] = 2.0f*V1[index] - V1_old[index];
 	   V2[index] = 2.0f*V2[index] - V2_old[index];
 	   V3[index] = 2.0f*V3[index] - V3_old[index];
@@ -576,14 +563,20 @@ __global__ void newU_kernel3D_ar3(float *V1, float *V2, float *V3, float *V1_old
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ)
 {
-	int dimTotal, dev = 0;
-	CHECK(cudaSetDevice(dev));
-	
-	dimTotal = dimX*dimY*dimZ;
+
+        int deviceCount = -1; // number of devices
+        cudaGetDeviceCount(&deviceCount);
+        if (deviceCount == 0) {
+         fprintf(stderr, "No CUDA devices found\n");
+        return -1;
+        }	
+
+	long dimTotal = (long)(dimX*dimY*dimZ);
+
        
         float *U_old, *d_U0, *d_U, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, tau, sigma;
-        tau = pow(L2,-0.5);
-        sigma = pow(L2,-0.5);
+        tau = powf(L2,-0.5f);
+        sigma = tau;
                                       
         CHECK(cudaMalloc((void**)&d_U0,dimTotal*sizeof(float)));
         CHECK(cudaMalloc((void**)&d_U,dimTotal*sizeof(float)));
@@ -611,41 +604,51 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo
         
         if (dimZ == 1) {
 	/*2D case */
-        dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-        dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+	dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+	dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
              
         for(int n=0; n < iterationsNumb; n++) {
 			
 	    /* Calculate Dual Variable P */
-            DualP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, P1, P2, dimX, dimY, sigma);
-      	    CHECK(cudaDeviceSynchronize());
+            DualP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, P1, P2, (long)(dimX), (long)(dimY), sigma);
+      	    checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*Projection onto convex set for P*/
-            ProjP_2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, alpha1);
-            CHECK(cudaDeviceSynchronize());
+            ProjP_2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, (long)(dimX), (long)(dimY), alpha1);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );            
             /* Calculate Dual Variable Q */
-            DualQ_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, Q1, Q2, Q3, dimX, dimY, sigma);
-            CHECK(cudaDeviceSynchronize());
+            DualQ_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, Q1, Q2, Q3, (long)(dimX), (long)(dimY), sigma);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*Projection onto convex set for Q*/
-            ProjQ_2D_kernel<<<dimGrid,dimBlock>>>(Q1, Q2, Q3, dimX, dimY, alpha0);
-            CHECK(cudaDeviceSynchronize());
+            ProjQ_2D_kernel<<<dimGrid,dimBlock>>>(Q1, Q2, Q3, (long)(dimX), (long)(dimY), alpha0);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*saving U into U_old*/
-            copyIm_TGV_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimTotal);
-            CHECK(cudaDeviceSynchronize());
+            copyIm_TGV_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, (long)(dimX), (long)(dimY));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*adjoint operation  -> divergence and projection of P*/
-            DivProjP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, d_U0, P1, P2, dimX, dimY, lambda, tau);
-            CHECK(cudaDeviceSynchronize());
+            DivProjP_2D_kernel<<<dimGrid,dimBlock>>>(d_U, d_U0, P1, P2, (long)(dimX), (long)(dimY), lambda, tau);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*get updated solution U*/
-            newU_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimTotal);
-            CHECK(cudaDeviceSynchronize());
+            newU_kernel<<<dimGrid,dimBlock>>>(d_U, U_old, (long)(dimX), (long)(dimY));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*saving V into V_old*/
-            copyIm_TGV_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, dimX, dimY, dimTotal);
-            CHECK(cudaDeviceSynchronize());
+            copyIm_TGV_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, (long)(dimX), (long)(dimY));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /* upd V*/
-            UpdV_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, P1, P2, Q1, Q2, Q3, dimX, dimY, tau);
-            CHECK(cudaDeviceSynchronize());
+            UpdV_2D_kernel<<<dimGrid,dimBlock>>>(V1, V2, P1, P2, Q1, Q2, Q3, (long)(dimX), (long)(dimY), tau);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*get new V*/
-            newU_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, dimX, dimY, dimTotal);
-            CHECK(cudaDeviceSynchronize());            
+            newU_kernel_ar2<<<dimGrid,dimBlock>>>(V1, V2, V1_old, V2_old, (long)(dimX), (long)(dimY));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
 	    }
         }
         else {
@@ -671,35 +674,45 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo
         for(int n=0; n < iterationsNumb; n++) {
 			
 	    /* Calculate Dual Variable P */
-            DualP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, V3, P1, P2, P3, dimX, dimY, dimZ, sigma);
-	    CHECK(cudaDeviceSynchronize());
+            DualP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, V1, V2, V3, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*Projection onto convex set for P*/
-            ProjP_3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, alpha1);
-            CHECK(cudaDeviceSynchronize());
+            ProjP_3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), alpha1);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /* Calculate Dual Variable Q */
-            DualQ_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, sigma);
-            CHECK(cudaDeviceSynchronize());
+            DualQ_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), sigma);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
              /*Projection onto convex set for Q*/
-            ProjQ_3D_kernel<<<dimGrid,dimBlock>>>(Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, alpha0);
-            CHECK(cudaDeviceSynchronize());
+            ProjQ_3D_kernel<<<dimGrid,dimBlock>>>(Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), alpha0);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*saving U into U_old*/
-            copyIm_TGV_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimZ, dimTotal);
-            CHECK(cudaDeviceSynchronize());
+            copyIm_TGV_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*adjoint operation  -> divergence and projection of P*/
-            DivProjP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, d_U0, P1, P2, P3, dimX, dimY, dimZ, lambda, tau);
-            CHECK(cudaDeviceSynchronize());
+            DivProjP_3D_kernel<<<dimGrid,dimBlock>>>(d_U, d_U0, P1, P2, P3, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, tau);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*get updated solution U*/
-            newU_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, dimX, dimY, dimZ, dimTotal);
-            CHECK(cudaDeviceSynchronize());
+            newU_kernel3D<<<dimGrid,dimBlock>>>(d_U, U_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*saving V into V_old*/
-            copyIm_TGV_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, dimX, dimY, dimZ, dimTotal);           
-            CHECK(cudaDeviceSynchronize());
+            copyIm_TGV_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));           
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /* upd V*/
-            UpdV_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, dimX, dimY, dimZ, tau);
-            CHECK(cudaDeviceSynchronize());
+            UpdV_3D_kernel<<<dimGrid,dimBlock>>>(V1, V2, V3, P1, P2, P3, Q1, Q2, Q3, Q4, Q5, Q6, (long)(dimX), (long)(dimY), (long)(dimZ), tau);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
             /*get new V*/
-            newU_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, dimX, dimY, dimZ, dimTotal);
-            CHECK(cudaDeviceSynchronize());            
+            newU_kernel3D_ar3<<<dimGrid,dimBlock>>>(V1, V2, V3, V1_old, V2_old, V3_old, (long)(dimX), (long)(dimY), (long)(dimZ));
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );
 	        }
 	        
         CHECK(cudaFree(Q4));
@@ -724,5 +737,7 @@ extern "C" int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, flo
         CHECK(cudaFree(V2));
         CHECK(cudaFree(V1_old));
         CHECK(cudaFree(V2_old));
+
+        cudaDeviceReset();
         return 0;
 }
diff --git a/src/Core/regularisers_GPU/TGV_GPU_core.h b/src/Core/regularisers_GPU/TGV_GPU_core.h
index 9f73d1c..e8f9c6e 100644
--- a/src/Core/regularisers_GPU/TGV_GPU_core.h
+++ b/src/Core/regularisers_GPU/TGV_GPU_core.h
@@ -1,6 +1,8 @@
 #ifndef __TGV_GPU_H__
 #define __TGV_GPU_H__
+
 #include "CCPiDefines.h"
+#include <memory.h>
 #include <stdio.h>
 
 extern "C" CCPI_EXPORT int TGV_GPU_main(float *U0, float *U, float lambda, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
author	vagrant <vagrant@localhost.localdomain>	2019-02-28 15:00:39 +0000
committer	vagrant <vagrant@localhost.localdomain>	2019-02-28 15:00:39 +0000
commit	364a703de9f31b35d4301f3e913f519be9d3a14f (patch)
tree	a398909ff87b22745829657f3e62b0439a64ad77
parent	7bb99cfd904b23c041be273ffc2746296e6eb814 (diff)
parent	4c728cf72345f7ab7967380cb536529fd9b1403d (diff)
download	regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.gz regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.bz2 regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.tar.xz regularization-364a703de9f31b35d4301f3e913f519be9d3a14f.zip