4 files changed, 111 insertions, 214 deletions
diff --git a/Wrappers/Python/CMakeLists.txt b/Wrappers/Python/CMakeLists.txt
index 9104afd..0c24540 100644
--- a/Wrappers/Python/CMakeLists.txt
+++ b/Wrappers/Python/CMakeLists.txt
@@ -76,7 +76,9 @@ if (BUILD_PYTHON_WRAPPER)
                         )
         endif()
         #set (PYTHON_DEST ${CMAKE_INSTALL_PREFIX}/python/)
+
         install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build/lib/ccpi
+
                 DESTINATION ${PYTHON_DEST})
         install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/data/ DESTINATION ${CMAKE_INSTALL_PREFIX}/share/ccpi)
         #file(TOUCH ${PYTHON_DEST}/edo/__init__.py)
diff --git a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
index 6391cf7..a45c3d2 100644
--- a/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
+++ b/Wrappers/Python/ccpi/optimisation/operators/GradientOperator.py
@@ -25,6 +25,11 @@ from ccpi.framework import ImageData, ImageGeometry, BlockGeometry, BlockDataCon
 import numpy 
 import warnings
 
+#default nThreads
+import multiprocessing
+cpus = multiprocessing.cpu_count()
+NUM_THREADS = max(int(cpus/2),1)
+
 NEUMANN = 'Neumann'
 PERIODIC = 'Periodic'
 C = 'c'
@@ -51,10 +56,11 @@ class Gradient(LinearOperator):
           'Space' or 'SpaceChannels', defaults to 'Space'
         * *backend* (``str``) --
           'c' or 'numpy', defaults to 'c' if correlation is 'SpaceChannels' or channels = 1
-   
-                                                        
+        * *num_threads* (``int``) --
+          If backend is 'c' specify the number of threads to use. Default is number of cpus/2          
+                 
+                 
         Example (2D): 
-
         .. math::
         
           \nabla : X -> Y \\
@@ -85,7 +91,7 @@ class Gradient(LinearOperator):
         if backend == NUMPY:
             self.operator = Gradient_numpy(gm_domain, bnd_cond=bnd_cond, **kwargs)
         else:
-            self.operator = Gradient_C(gm_domain, bnd_cond=bnd_cond)
+            self.operator = Gradient_C(gm_domain, bnd_cond=bnd_cond, **kwargs)
 
 
     def direct(self, x, out=None):
@@ -183,7 +189,7 @@ class Gradient_numpy(LinearOperator):
         
         # Call FiniteDiff operator        
         self.FD = FiniteDiff(self.gm_domain, direction = 0, bnd_cond = self.bnd_cond)
-                                                         
+        print("Initialised GradientOperator with numpy backend")               
         
     def direct(self, x, out=None):
         
@@ -275,7 +281,8 @@ class Gradient_numpy(LinearOperator):
             spMat = SparseFiniteDiff(self.gm_domain, direction=self.ind[i], bnd_cond=self.bnd_cond)
             res.append(spMat.sum_abs_col())
         return BlockDataContainer(*res)
-   
+
+
 import ctypes, platform
 
 # check for the extension
@@ -288,14 +295,13 @@ elif platform.system() == 'Darwin':
 else:
     raise ValueError('Not supported platform, ', platform.system())
 
-#print ("dll location", dll)
-cilacc = ctypes.cdll.LoadLibrary(dll)
 
-#FD = ctypes.CDLL(dll)
+cilacc = ctypes.cdll.LoadLibrary(dll)
 
 c_float_p = ctypes.POINTER(ctypes.c_float)
 
 cilacc.openMPtest.restypes = ctypes.c_int32
+cilacc.openMPtest.argtypes = [ctypes.c_int32]
 
 cilacc.fdiff4D.argtypes = [ctypes.POINTER(ctypes.c_float),
                        ctypes.POINTER(ctypes.c_float),
@@ -307,6 +313,7 @@ cilacc.fdiff4D.argtypes = [ctypes.POINTER(ctypes.c_float),
                        ctypes.c_long,
                        ctypes.c_long,
                        ctypes.c_int32,
+                       ctypes.c_int32,
                        ctypes.c_int32]
 
 cilacc.fdiff3D.argtypes = [ctypes.POINTER(ctypes.c_float),
@@ -317,6 +324,7 @@ cilacc.fdiff3D.argtypes = [ctypes.POINTER(ctypes.c_float),
                        ctypes.c_long,
                        ctypes.c_long,
                        ctypes.c_int32,
+                       ctypes.c_int32,
                        ctypes.c_int32]
 
 cilacc.fdiff2D.argtypes = [ctypes.POINTER(ctypes.c_float),
@@ -325,6 +333,7 @@ cilacc.fdiff2D.argtypes = [ctypes.POINTER(ctypes.c_float),
                        ctypes.c_long,
                        ctypes.c_long,
                        ctypes.c_int32,
+                       ctypes.c_int32,
                        ctypes.c_int32]
 
 
@@ -336,10 +345,12 @@ class Gradient_C(LinearOperator):
                      on 2D, 3D, 4D ImageData
                      under Neumann/Periodic boundary conditions'''
 
-    def __init__(self, gm_domain, gm_range=None, bnd_cond = NEUMANN):
+    def __init__(self, gm_domain, gm_range=None, bnd_cond = NEUMANN, **kwargs):
 
         super(Gradient_C, self).__init__() 
 
+        self.num_threads = kwargs.get('num_threads',NUM_THREADS)
+
         self.gm_domain = gm_domain
         self.gm_range = gm_range
         
@@ -362,8 +373,9 @@ class Gradient_C(LinearOperator):
             self.fd = cilacc.fdiff2D
         else:
             raise ValueError('Number of dimensions not supported, expected 2, 3 or 4, got {}'.format(len(gm_domain.shape)))
-        
-                        
+      #self.num_threads
+        print("Initialised GradientOperator with C backend running with ", cilacc.openMPtest(self.num_threads)," threads")               
+
     @staticmethod 
     def datacontainer_as_c_pointer(x):
         ndx = x.as_array()
@@ -377,9 +389,10 @@ class Gradient_C(LinearOperator):
             out = self.gm_range.allocate(None)
             return_val = True
 
+        #pass list of all arguments
         arg1 = [Gradient_C.datacontainer_as_c_pointer(out.get_item(i))[1] for i in range(self.gm_range.shape[0])]
         arg2 = [el for el in x.shape]
-        args = arg1 + arg2 + [self.bnd_cond, 1]
+        args = arg1 + arg2 + [self.bnd_cond, 1, self.num_threads]
         self.fd(x_p, *args)
         
         if return_val is True:
@@ -397,7 +410,7 @@ class Gradient_C(LinearOperator):
 
         arg1 = [Gradient_C.datacontainer_as_c_pointer(x.get_item(i))[1] for i in range(self.gm_range.shape[0])]
         arg2 = [el for el in out.shape]
-        args = arg1 + arg2 + [self.bnd_cond, 0]
+        args = arg1 + arg2 + [self.bnd_cond, 0, self.num_threads]
 
         self.fd(out_p, *args)
 
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 741b985..e828fe5 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -1,5 +1,6 @@
 
 set (CMAKE_C_STANDARD 11)
+set (CMAKE_BUILD_TYPE Release)
 
 if(APPLE)
     if(NOT DEFINED OPENMP_INCLUDES OR NOT DEFINED OPENMP_LIBRARIES)
diff --git a/src/Core/FiniteDifferenceLibrary.c b/src/Core/FiniteDifferenceLibrary.c
index f5736e2..fbf2646 100644
--- a/src/Core/FiniteDifferenceLibrary.c
+++ b/src/Core/FiniteDifferenceLibrary.c
@@ -2,161 +2,66 @@
 
 #include "FiniteDifferenceLibrary.h"
 
-DLL_EXPORT int openMPtest(void)
+DLL_EXPORT int openMPtest(int nThreads)
 {
-	int nThreads;
+	omp_set_num_threads(nThreads);
+
+	int nThreads_running;
 #pragma omp parallel
 	{
 		if (omp_get_thread_num() == 0)
 		{
-			nThreads = omp_get_num_threads();
+			nThreads_running = omp_get_num_threads();
 		}
 	}
-	return nThreads;
+	return nThreads_running;
 }
-int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+void threads_setup(int nThreads_requested, int *nThreads_current)
 {
-	size_t volume = nx * ny * nz;
-
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
-
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
-	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
-
-	long c;
-	for (c = 0; c < nc; c++)
-	{
 #pragma omp parallel
-		{
-			long ind, k, j, i;
-			float pix0;
-			//run over all and then fix boundaries
-#pragma omp for
-			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
-			{
-				pix0 = -inimage[ind];
-
-				outimageX[ind] = pix0 + inimage[ind + 1];
-				outimageY[ind] = pix0 + inimage[ind + nx];
-				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
-			}
-
-#pragma omp for
-			for (ind = 0; ind < nx * (ny - 1); ind++)
-			{
-				pix0 = -inimage[ind + offset1];
-
-				outimageX[ind + offset1] = pix0 + inimage[ind + offset1 + 1];
-				outimageY[ind + offset1] = pix0 + inimage[ind + offset1 + nx];
-			}
-
-#pragma omp for
-			for (ind = 0; ind < nx - 1; ind++)
-			{
-				pix0 = -inimage[ind + offset2];
-
-				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
-			}
-
-			//boundaries
-#pragma omp for
-			for (k = 0; k < nz; k++)
-			{
-				for (i = 0; i < nx; i++)
-				{
-					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
-				}
-			}
-
-#pragma omp for
-			for (k = 0; k < nz; k++)
-			{
-				for (j = 0; j < ny; j++)
-				{
-					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
-				}
-			}
-
-			if (nz > 1)
-			{
-#pragma omp for
-				for (ind = 0; ind < ny * nx; ind++)
-				{
-					outimageZ[nx * ny * (nz - 1) + ind] = 0;
-				}
-			}
-		}
-
-		inimage += volume;
-		outimageX += volume;
-		outimageY += volume;
-		outimageZ += volume;
-	}
-
-
-	//now the rest of the channels
-	if (nc > 1)
 	{
-		long ind;
-
-		for (c = 0; c < nc - 1; c++)
-		{
-			float * outimageC = outimageCfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
-
-#pragma omp parallel for
-			for (ind = 0; ind < volume; ind++)
-			{
-				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
-			}
-		}
-
-#pragma omp parallel for
-		for (ind = 0; ind < volume; ind++)
+		if (omp_get_thread_num() == 0)
 		{
-			outimageCfull[(nc - 1) * volume + ind] = 0;
+			*nThreads_current = omp_get_num_threads();
 		}
 	}
-
-	return 0;
+	omp_set_num_threads(nThreads_requested);
 }
-int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normfull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
+
+int fdiff_direct_neumann(const float *inimagefull, float *outimageXfull, float *outimageYfull, float *outimageZfull, float *outimageCfull, long nx, long ny, long nz, long nc)
 {
 	size_t volume = nx * ny * nz;
-	int z_dim = nz - 1;
-
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
-	float * outimageL21norm = outimageL21normfull;
 
+	const float *inimage = inimagefull;
+	float *outimageX = outimageXfull;
+	float *outimageY = outimageYfull;
+	float *outimageZ = outimageZfull;
 
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset1 = (nz - 1) * nx * ny;	  //ind to beginning of last slice
 	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
 
 	long c;
+	
+	int z_dim = nz > 1 ? 1: 0;
+
 	for (c = 0; c < nc; c++)
 	{
 #pragma omp parallel
 		{
-			long ind, k;
+			long ind, k, j, i;
 			float pix0;
 			//run over all and then fix boundaries
-#pragma omp for
+
+#pragma omp for nowait
 			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
 			{
 				pix0 = -inimage[ind];
-
 				outimageX[ind] = pix0 + inimage[ind + 1];
 				outimageY[ind] = pix0 + inimage[ind + nx];
 				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * (ny - 1); ind++)
 			{
 				pix0 = -inimage[ind + offset1];
@@ -169,29 +74,26 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 			for (ind = 0; ind < nx - 1; ind++)
 			{
 				pix0 = -inimage[ind + offset2];
-
 				outimageX[ind + offset2] = pix0 + inimage[ind + offset2 + 1];
 			}
 
 			//boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
-				for (int i = 0; i < nx; i++)
+				for (i = 0; i < nx; i++)
 				{
 					outimageY[(k * ny * nx) + (ny - 1) * nx + i] = 0;
 				}
 			}
-
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
-				for (int j = 0; j < ny; j++)
+				for (j = 0; j < ny; j++)
 				{
 					outimageX[k * ny * nx + j * nx + nx - 1] = 0;
 				}
 			}
-
 			if (z_dim)
 			{
 #pragma omp for
@@ -199,34 +101,15 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 				{
 					outimageZ[nx * ny * (nz - 1) + ind] = 0;
 				}
-
-#pragma omp for
-				for (ind = 0; ind < volume; ind++)
-				{
-					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind] + outimageZ[ind] * outimageZ[ind];
-				}
-
-			}
-			else
-			{
-
-#pragma omp for
-				for (ind = 0; ind < volume; ind++)
-				{
-					outimageL21norm[ind] = outimageX[ind] * outimageX[ind] + outimageY[ind] * outimageY[ind];
-				}
 			}
 		}
 
-	
 		inimage += volume;
 		outimageX += volume;
 		outimageY += volume;
 		outimageZ += volume;
-		outimageL21norm += volume;
 	}
 
-
 	//now the rest of the channels
 	if (nc > 1)
 	{
@@ -234,18 +117,13 @@ int fdiff_direct_neumann_L21(const float *inimagefull, float *outimageL21normful
 
 		for (c = 0; c < nc - 1; c++)
 		{
-			float * outimageC = outimageCfull + c * volume;
-			float * outimageL21norm = outimageL21normfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
+			float *outimageC = outimageCfull + c * volume;
+			const float *inimage = inimagefull + c * volume;
 
 #pragma omp parallel for
 			for (ind = 0; ind < volume; ind++)
 			{
 				outimageC[ind] = -inimage[ind] + inimage[ind + volume];
-				outimageL21norm[ind] += outimageC[ind] * outimageC[ind];
-
-				//sqrt
-				outimageL21norm[ind] = (float)sqrt(outimageL21norm[ind]);
 			}
 		}
 
@@ -262,15 +140,14 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 {
 	size_t volume = nx * ny * nz;
 
-	const float * inimage = inimagefull;
-	float * outimageX = outimageXfull;
-	float * outimageY = outimageYfull;
-	float * outimageZ = outimageZfull;
+	const float *inimage = inimagefull;
+	float *outimageX = outimageXfull;
+	float *outimageY = outimageYfull;
+	float *outimageZ = outimageZfull;
 
-	int offset1 = (nz - 1) * nx * ny; //ind to beginning of last slice
+	int offset1 = (nz - 1) * nx * ny;	  //ind to beginning of last slice
 	int offset2 = offset1 + (ny - 1) * nx; //ind to beginning of last row
 
-
 	long c;
 	for (c = 0; c < nc; c++)
 	{
@@ -280,7 +157,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 			long ind, k;
 			float pix0;
 			//run over all and then fix boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * ny * (nz - 1); ind++)
 			{
 				pix0 = -inimage[ind];
@@ -290,7 +167,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				outimageZ[ind] = pix0 + inimage[ind + nx * ny];
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (ind = 0; ind < nx * (ny - 1); ind++)
 			{
 				pix0 = -inimage[ind + offset1];
@@ -308,7 +185,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 			}
 
 			//boundaries
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
 				for (int i = 0; i < nx; i++)
@@ -320,7 +197,7 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				}
 			}
 
-#pragma omp for
+#pragma omp for nowait
 			for (k = 0; k < nz; k++)
 			{
 				for (int j = 0; j < ny; j++)
@@ -332,10 +209,9 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 				}
 			}
 
-
 			if (nz > 1)
 			{
-#pragma omp for
+#pragma omp for nowait
 				for (ind = 0; ind < ny * nx; ind++)
 				{
 					outimageZ[nx * ny * (nz - 1) + ind] = -inimage[nx * ny * (nz - 1) + ind] + inimage[ind];
@@ -356,8 +232,8 @@ int fdiff_direct_periodic(const float *inimagefull, float *outimageXfull, float
 
 		for (c = 0; c < nc - 1; c++)
 		{
-			float * outimageC = outimageCfull + c * volume;
-			const float * inimage = inimagefull + c * volume;
+			float *outimageC = outimageCfull + c * volume;
+			const float *inimage = inimagefull + c * volume;
 
 #pragma omp parallel for
 			for (ind = 0; ind < volume; ind++)
@@ -383,18 +259,18 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 	//assumes nx and ny > 1
 	int z_dim = nz - 1;
 
-	float * outimage = outimagefull;
-	const float * inimageX = inimageXfull;
-	const float * inimageY = inimageYfull;
-	const float * inimageZ = inimageZfull;
+	float *outimage = outimagefull;
+	const float *inimageX = inimageXfull;
+	const float *inimageY = inimageYfull;
+	const float *inimageZ = inimageZfull;
 
-	float * tempX = (float*)malloc(volume * sizeof(float));
-	float * tempY = (float*)malloc(volume * sizeof(float));
-	float * tempZ;
-	
-	if(z_dim)
+	float *tempX = (float *)malloc(volume * sizeof(float));
+	float *tempY = (float *)malloc(volume * sizeof(float));
+	float *tempZ;
+
+	if (z_dim)
 	{
-		tempZ = (float*)malloc(volume * sizeof(float));
+		tempZ = (float *)malloc(volume * sizeof(float));
 	}
 
 	long c;
@@ -413,7 +289,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 			for (ind = nx; ind < nx * ny * nz; ind++)
 			{
 				tempY[ind] = -inimageY[ind] + inimageY[ind - nx];
-
 			}
 
 			//boundaries
@@ -442,7 +317,7 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 				for (ind = nx * ny; ind < nx * ny * nz; ind++)
 				{
 					tempZ[ind] = -inimageZ[ind] + inimageZ[ind - nx * ny];
-				}				
+				}
 #pragma omp for
 				for (ind = 0; ind < ny * nx; ind++)
 				{
@@ -454,7 +329,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 				{
 					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
 				}
-
 			}
 			else
 			{
@@ -464,7 +338,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 					outimage[ind] = tempX[ind] + tempY[ind];
 				}
 			}
-
 		}
 
 		outimage += volume;
@@ -475,10 +348,9 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 	free(tempX);
 	free(tempY);
 
-	if(z_dim)
+	if (z_dim)
 		free(tempZ);
 
-
 	//	//now the rest of the channels
 	if (nc > 1)
 	{
@@ -500,7 +372,6 @@ int fdiff_adjoint_neumann(float *outimagefull, const float *inimageXfull, const
 			outimagefull[ind] += -inimageCfull[ind];
 			outimagefull[(nc - 1) * volume + ind] += inimageCfull[(nc - 2) * volume + ind];
 		}
-
 	}
 
 	return 0;
@@ -513,18 +384,18 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	//assumes nx and ny > 1
 	int z_dim = nz - 1;
 
-	float * outimage = outimagefull;
-	const float * inimageX = inimageXfull;
-	const float * inimageY = inimageYfull;
-	const float * inimageZ = inimageZfull;
+	float *outimage = outimagefull;
+	const float *inimageX = inimageXfull;
+	const float *inimageY = inimageYfull;
+	const float *inimageZ = inimageZfull;
 
-	float * tempX = (float*)malloc(volume * sizeof(float));
-	float * tempY = (float*)malloc(volume * sizeof(float));
-	float * tempZ;
+	float *tempX = (float *)malloc(volume * sizeof(float));
+	float *tempY = (float *)malloc(volume * sizeof(float));
+	float *tempZ;
 
 	if (z_dim)
 	{
-		tempZ = (float*)malloc(volume * sizeof(float));
+		tempZ = (float *)malloc(volume * sizeof(float));
 	}
 
 	long c;
@@ -583,7 +454,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 				{
 					outimage[ind] = tempX[ind] + tempY[ind] + tempZ[ind];
 				}
-
 			}
 			else
 			{
@@ -593,7 +463,6 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 					outimage[ind] = tempX[ind] + tempY[ind];
 				}
 			}
-
 		}
 
 		outimage += volume;
@@ -604,7 +473,7 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	free(tempX);
 	free(tempY);
 
-	if(z_dim)
+	if (z_dim)
 		free(tempZ);
 
 	//now the rest of the channels
@@ -632,9 +501,11 @@ int fdiff_adjoint_periodic(float *outimagefull, const float *inimageXfull, const
 	return 0;
 }
 
-
-DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, float *gradYfull, float *gradXfull, long nc, long nz, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -643,17 +514,21 @@ DLL_EXPORT int fdiff4D(float *imagefull, float *gradCfull, float *gradZfull, flo
 			fdiff_adjoint_periodic(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 	}
 	else
-		{
+	{
 		if (direction)
 			fdiff_direct_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 		else
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, gradCfull, nx, ny, nz, nc);
 	}
-	
+
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
-DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, float *gradXfull, long nz, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -669,10 +544,14 @@ DLL_EXPORT int fdiff3D(float *imagefull, float *gradZfull, float *gradYfull, flo
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, gradZfull, NULL, nx, ny, nz, 1);
 	}
 
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
-DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction)
+DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, long ny, long nx, int boundary, int direction, int nThreads)
 {
+	int nThreads_initial;
+	threads_setup(nThreads, &nThreads_initial);
+
 	if (boundary)
 	{
 		if (direction)
@@ -688,5 +567,7 @@ DLL_EXPORT int fdiff2D(float *imagefull, float *gradYfull, float *gradXfull, lon
 			fdiff_adjoint_neumann(imagefull, gradXfull, gradYfull, NULL, NULL, nx, ny, 1, 1);
 	}
 
+	omp_set_num_threads(nThreads_initial);
 	return 0;
 }
+