From d2d4b4bd1db461fb9215abd20f829ce0298f3876 Mon Sep 17 00:00:00 2001
From: dkazanc <dkazanc@hotmail.com>
Date: Tue, 3 Sep 2019 17:02:54 +0100
Subject: CPU fixed, GPU almost

---
 build/run.sh                                       |   6 +-
 src/Core/regularisers_CPU/Nonlocal_TV_core.c       |  73 ++++---
 src/Core/regularisers_CPU/Nonlocal_TV_core.h       |  18 +-
 src/Core/regularisers_CPU/PatchSelect_core.c       | 164 ++++----------
 src/Core/regularisers_CPU/PatchSelect_core.h       |   7 +-
 src/Core/regularisers_GPU/PatchSelect_GPU_core.cu  | 236 +++++++++++----------
 .../mex_compile/regularisers_CPU/Nonlocal_TV.c     |  30 +--
 .../mex_compile/regularisers_CPU/PatchSelect.c     |  24 +--
 src/Python/src/cpu_regularisers.pyx                |   8 +-
 9 files changed, 246 insertions(+), 320 deletions(-)

diff --git a/build/run.sh b/build/run.sh
index c243b12..285476b 100755
--- a/build/run.sh
+++ b/build/run.sh
@@ -6,9 +6,9 @@ rm -r ../build_proj
 mkdir ../build_proj
 cd ../build_proj/
 #make clean
-export CIL_VERSION=19.06
+export CIL_VERSION=19.09
 # install Python modules without CUDA
-# cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
+#cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 # install Python modules with CUDA
 cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 # install Matlab modules without CUDA
@@ -21,7 +21,7 @@ make install
 cp install/lib/libcilreg.so install/python/ccpi/filters
 cd install/python
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../lib
-spyder
+spyder --new-instance
 ############### Matlab(linux)###############
 # export LD_PRELOAD=/home/algol/anaconda3/lib/libstdc++.so.6  # if there is libstdc error in matlab
 # PATH="/path/to/mex/:$PATH" LD_LIBRARY_PATH="/path/to/library:$LD_LIBRARY_PATH" matlab
diff --git a/src/Core/regularisers_CPU/Nonlocal_TV_core.c b/src/Core/regularisers_CPU/Nonlocal_TV_core.c
index c4c9118..de1c173 100644
--- a/src/Core/regularisers_CPU/Nonlocal_TV_core.c
+++ b/src/Core/regularisers_CPU/Nonlocal_TV_core.c
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,50 +31,55 @@
  * 2. AR_i - indeces of i neighbours
  * 3. AR_j - indeces of j neighbours
  * 4. AR_k - indeces of k neighbours (0 - for 2D case)
- * 5. Weights_ij(k) - associated weights 
+ * 5. Weights_ij(k) - associated weights
  * 6. regularisation parameter
- * 7. iterations number 
- 
+ * 7. iterations number
+
  * Output:
- * 1. denoised image/volume 	
+ * 1. denoised image/volume
  * Elmoataz, Abderrahim, Olivier Lezoray, and Sébastien Bougleux. "Nonlocal discrete regularization on weighted graphs: a framework for image and manifold processing." IEEE Trans. Image Processing 17, no. 7 (2008): 1047-1060.
- 
+
  */
 /*****************************************************************************/
 
-float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb)
+float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb, int switchM)
 {
 
     long i, j, k;
     int iter;
     lambdaReg = 1.0f/lambdaReg;
-         
+
     /*****2D INPUT *****/
     if (dimZ == 0) {
 	  copyIm(A_orig, Output, (long)(dimX), (long)(dimY), 1l);
     /* for each pixel store indeces of the most similar neighbours (patches) */
-     for(iter=0; iter<IterNumb; iter++) {    
+     for(iter=0; iter<IterNumb; iter++) {
 #pragma omp parallel for shared (A_orig, Output, Weights, H_i, H_j, iter) private(i,j)
+   for(j=0; j<(long)(dimY); j++) {
       for(i=0; i<(long)(dimX); i++) {
-            for(j=0; j<(long)(dimY); j++) {              
              /*NLM_H1_2D(Output, A_orig, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), NumNeighb, lambdaReg);*/  /* NLM - H1 penalty */
-             NLM_TV_2D(Output, A_orig, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), NumNeighb, lambdaReg);  /* NLM - TV penalty */
+             if (switchM == 1) {
+               NLM_TV_2D(Output, A_orig, H_j, H_i, Weights, i, j, (long)(dimX), (long)(dimY), NumNeighb, lambdaReg);  /* NLM - TV penalty */
+             }
+             else {
+               NLM_TV_2D(Output, A_orig, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), NumNeighb, lambdaReg);  /* NLM - TV penalty */
+             }
            }}
           }
-    }  
+    }
     else {
      /*****3D INPUT *****/
         copyIm(A_orig, Output, (long)(dimX), (long)(dimY), (long)(dimZ));
     /* for each pixel store indeces of the most similar neighbours (patches) */
-     for(iter=0; iter<IterNumb; iter++) {    
+     for(iter=0; iter<IterNumb; iter++) {
 #pragma omp parallel for shared (A_orig, Output, Weights, H_i, H_j, H_k, iter) private(i,j,k)
       for(i=0; i<(long)(dimX); i++) {
-            for(j=0; j<(long)(dimY); j++) {              
+            for(j=0; j<(long)(dimY); j++) {
                for(k=0; k<(long)(dimZ); k++) {
             /* NLM_H1_3D(Output, A_orig, H_i, H_j, H_k, Weights, i, j, k, dimX, dimY, dimZ, NumNeighb, lambdaReg); */ /* NLM - H1 penalty */
-            NLM_TV_3D(Output, A_orig, H_i, H_j, H_k, Weights, i, j, k, (long)(dimX), (long)(dimY), (long)(dimZ), NumNeighb, lambdaReg);   /* NLM - TV penalty */     
-           }}}          
-          }          
+            NLM_TV_3D(Output, A_orig, H_i, H_j, H_k, Weights, i, j, k, (long)(dimX), (long)(dimY), (long)(dimZ), NumNeighb, lambdaReg);   /* NLM - TV penalty */
+           }}}
+          }
     }
     return *Output;
 }
@@ -82,9 +87,9 @@ float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, un
 /***********<<<<Main Function for NLM - H1 penalty>>>>**********/
 float NLM_H1_2D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, int NumNeighb, float lambdaReg)
 {
-	long x, i1, j1, index, index_m; 
+	long x, i1, j1, index, index_m;
 	float value = 0.0f, normweight  = 0.0f;
-	
+
 	index_m = j*dimX+i;
 	for(x=0; x < NumNeighb; x++) {
 	index =  (dimX*dimY*x) + j*dimX+i;
@@ -99,9 +104,9 @@ float NLM_H1_2D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_
 /*3D version*/
 float NLM_H1_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, long i, long j, long k, long dimX, long dimY, long dimZ, int NumNeighb, float lambdaReg)
 {
-	long x, i1, j1, k1, index; 
+	long x, i1, j1, k1, index;
 	float value = 0.0f, normweight  = 0.0f;
-	
+
 	for(x=0; x < NumNeighb; x++) {
 	index = dimX*dimY*dimZ*x + (dimX*dimY*k) + j*dimX+i;
 		i1 = H_i[index];
@@ -109,7 +114,7 @@ float NLM_H1_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_
 		k1 = H_k[index];
 		value += A[(dimX*dimY*k1) + j1*dimX+i1]*Weights[index];
 		normweight += Weights[index];
-	}	
+	}
     A[(dimX*dimY*k) + j*dimX+i] = (lambdaReg*A_orig[(dimX*dimY*k) + j*dimX+i] + value)/(lambdaReg + normweight);
     return *A;
 }
@@ -118,37 +123,37 @@ float NLM_H1_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_
 /***********<<<<Main Function for NLM - TV penalty>>>>**********/
 float NLM_TV_2D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, int NumNeighb, float lambdaReg)
 {
-	long x, i1, j1, index, index_m; 
+	long x, i1, j1, index, index_m;
 	float value = 0.0f, normweight  = 0.0f, NLgrad_magn = 0.0f, NLCoeff;
-	
+
 	 index_m = j*dimX+i;
-		
+
 	for(x=0; x < NumNeighb; x++) {
 		index =  (dimX*dimY*x) + j*dimX+i; /*c*/
 		i1 = H_i[index];
 		j1 = H_j[index];
 		NLgrad_magn += powf((A[j1*dimX+i1] - A[index_m]),2)*Weights[index];
 	}
-  
+
     NLgrad_magn = sqrtf(NLgrad_magn); /*Non Local Gradients Magnitude */
     NLCoeff = 2.0f*(1.0f/(NLgrad_magn + EPS));
-    		
+
     for(x=0; x < NumNeighb; x++) {
 	index =  (dimX*dimY*x) + j*dimX+i; /*c*/
 	i1 = H_i[index];
 	j1 = H_j[index];
         value += A[j1*dimX+i1]*NLCoeff*Weights[index];
         normweight += Weights[index]*NLCoeff;
-    }   		
+    }
     A[index_m] = (lambdaReg*A_orig[index_m] + value)/(lambdaReg + normweight);
     return *A;
 }
 /*3D version*/
 float NLM_TV_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, long i, long j, long k, long dimX, long dimY, long dimZ, int NumNeighb, float lambdaReg)
 {
-	long x, i1, j1, k1, index; 
+	long x, i1, j1, k1, index;
 	float value = 0.0f, normweight  = 0.0f, NLgrad_magn = 0.0f, NLCoeff;
-	
+
 	for(x=0; x < NumNeighb; x++) {
 	index =  dimX*dimY*dimZ*x + (dimX*dimY*k) + j*dimX+i;
 		i1 = H_i[index];
@@ -156,10 +161,10 @@ float NLM_TV_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_
 		k1 = H_k[index];
 	        NLgrad_magn += powf((A[(dimX*dimY*k1) + j1*dimX+i1] - A[(dimX*dimY*k1) + j*dimX+i]),2)*Weights[index];
 	}
-  
+
     NLgrad_magn = sqrtf(NLgrad_magn); /*Non Local Gradients Magnitude */
     NLCoeff = 2.0f*(1.0f/(NLgrad_magn + EPS));
-    		
+
     for(x=0; x < NumNeighb; x++) {
 	index = dimX*dimY*dimZ*x + (dimX*dimY*k) + j*dimX+i;
 	i1 = H_i[index];
@@ -167,7 +172,7 @@ float NLM_TV_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_
 	k1 = H_k[index];
         value += A[(dimX*dimY*k1) + j1*dimX+i1]*NLCoeff*Weights[index];
         normweight += Weights[index]*NLCoeff;
-    }   		
+    }
     A[(dimX*dimY*k) + j*dimX+i] = (lambdaReg*A_orig[(dimX*dimY*k) + j*dimX+i] + value)/(lambdaReg + normweight);
     return *A;
 }
diff --git a/src/Core/regularisers_CPU/Nonlocal_TV_core.h b/src/Core/regularisers_CPU/Nonlocal_TV_core.h
index 6d55101..18f6724 100644
--- a/src/Core/regularisers_CPU/Nonlocal_TV_core.h
+++ b/src/Core/regularisers_CPU/Nonlocal_TV_core.h
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,19 +39,19 @@
  * 2. AR_i - indeces of i neighbours
  * 3. AR_j - indeces of j neighbours
  * 4. AR_k - indeces of k neighbours (0 - for 2D case)
- * 5. Weights_ij(k) - associated weights 
+ * 5. Weights_ij(k) - associated weights
  * 6. regularisation parameter
- * 7. iterations number 
- 
+ * 7. iterations number
+
  * Output:
- * 1. denoised image/volume 	
- * Elmoataz, Abderrahim, Olivier Lezoray, and Sébastien Bougleux. "Nonlocal discrete regularization on weighted graphs: a framework for image and manifold processing." IEEE Trans.   Image Processing 17, no. 7 (2008): 1047-1060. 
+ * 1. denoised image/volume
+ * Elmoataz, Abderrahim, Olivier Lezoray, and Sébastien Bougleux. "Nonlocal discrete regularization on weighted graphs: a framework for image and manifold processing." IEEE Trans.   Image Processing 17, no. 7 (2008): 1047-1060.
  */
- 
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-CCPI_EXPORT float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb);
+CCPI_EXPORT float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb, int switchM);
 CCPI_EXPORT float NLM_H1_2D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, int NumNeighb, float lambdaReg);
 CCPI_EXPORT float NLM_TV_2D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, int NumNeighb, float lambdaReg);
 CCPI_EXPORT float NLM_H1_3D(float *A, float *A_orig, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, long i, long j, long k, long dimX, long dimY, long dimZ, int NumNeighb, float lambdaReg);
diff --git a/src/Core/regularisers_CPU/PatchSelect_core.c b/src/Core/regularisers_CPU/PatchSelect_core.c
index cf5cdc7..8581797 100644
--- a/src/Core/regularisers_CPU/PatchSelect_core.c
+++ b/src/Core/regularisers_CPU/PatchSelect_core.c
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,27 +44,27 @@
  * 4. Weights_ijk - associated weights
  */
 
-void swap(float *xp, float *yp) 
-{ 
-    float temp = *xp; 
-    *xp = *yp; 
-    *yp = temp; 
-} 
+void swap(float *xp, float *yp)
+{
+    float temp = *xp;
+    *xp = *yp;
+    *yp = temp;
+}
 
-void swapUS(unsigned short *xp, unsigned short *yp) 
-{ 
-    unsigned short temp = *xp; 
-    *xp = *yp; 
-    *yp = temp; 
-} 
+void swapUS(unsigned short *xp, unsigned short *yp)
+{
+    unsigned short temp = *xp;
+    *xp = *yp;
+    *yp = temp;
+}
 /**************************************************/
 
-float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h, int switchM)
+float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h)
 {
     int counterG;
     long i, j, k;
     float *Eucl_Vec, h2;
-    h2 = h*h;   
+    h2 = h*h;
     /****************2D INPUT ***************/
     if (dimZ == 0) {
         /* generate a 2D Gaussian kernel for NLM procedure */
@@ -76,23 +76,14 @@ float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, u
                 counterG++;
             }} /*main neighb loop */
         /* for each pixel store indeces of the most similar neighbours (patches) */
-        if (switchM == 1) {
-#pragma omp parallel for shared (A, Weights, H_i, H_j) private(i,j)
-    for(i=0; i<(long)(dimX); i++) {
-          for(j=0; j<(long)(dimY); j++) {
-                Indeces2D_p(A, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
-            }}
-        }
-        else {
 #pragma omp parallel for shared (A, Weights, H_i, H_j) private(i,j)
     for(i=0; i<(long)(dimX); i++) {
           for(j=0; j<(long)(dimY); j++) {
                 Indeces2D(A, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
             }}
-            }
     }
     else {
-    /****************3D INPUT ***************/       
+    /****************3D INPUT ***************/
         /* generate a 3D Gaussian kernel for NLM procedure */
         Eucl_Vec = (float*) calloc ((2*SimilarWin+1)*(2*SimilarWin+1)*(2*SimilarWin+1),sizeof(float));
         counterG = 0;
@@ -101,25 +92,15 @@ float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, u
                 for(k=-SimilarWin; k<=SimilarWin; k++) {
                     Eucl_Vec[counterG] = (float)exp(-(pow(((float) i), 2) + pow(((float) j), 2) + pow(((float) k), 2))/(2*SimilarWin*SimilarWin*SimilarWin));
                     counterG++;
-                }}} /*main neighb loop */     
-        
+                }}} /*main neighb loop */
+
         /* for each voxel store indeces of the most similar neighbours (patches) */
-        if (switchM == 1) {
 #pragma omp parallel for shared (A, Weights, H_i, H_j, H_k) private(i,j,k)
         for(i=0; i<dimX; i++) {
             for(j=0; j<dimY; j++) {
                 for(k=0; k<dimZ; k++) {
-                    Indeces3D(A, H_i, H_j, H_k, Weights, j, i, (k), (dimX), (dimY), (dimZ), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
+                    Indeces3D(A, H_i, H_j, H_k, Weights, i, j, k, (long)(dimX), (long)(dimY), (long)(dimZ), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
                 }}}
-        }
-        else {
-#pragma omp parallel for shared (A, Weights, H_i, H_j, H_k) private(i,j,k)
-        for(i=0; i<dimX; i++) {
-            for(j=0; j<dimY; j++) {
-                for(k=0; k<dimZ; k++) {
-                    Indeces3D(A, H_i, H_j, H_k, Weights, (i), (j), (k), (dimX), (dimY), (dimZ), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
-                }}}
-            }
     }
     free(Eucl_Vec);
     return 1;
@@ -130,78 +111,13 @@ float Indeces2D(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *W
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, index, sizeWin_tot, counterG;
     float *Weight_Vec, normsum;
     unsigned short *ind_i, *ind_j;
-    
-    sizeWin_tot = (2*SearchWindow + 1)*(2*SearchWindow + 1);
-    
-    Weight_Vec = (float*) calloc(sizeWin_tot, sizeof(float));
-    ind_i = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
-    ind_j = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
-    
-    counter = 0;
-    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
-        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
-            i1 = i+i_m;
-            j1 = j+j_m;
-            if (((i1 >= 0) && (i1 < dimX)) && ((j1 >= 0) && (j1 < dimY))) {
-                normsum = 0.0f; counterG = 0;
-                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
-                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
-                        i2 = i1 + i_c;
-                        j2 = j1 + j_c;
-                        i3 = i + i_c;
-                        j3 = j + j_c;
-                        if (((i2 >= 0) && (i2 < dimX)) && ((j2 >= 0) && (j2 < dimY))) {
-                            if (((i3 >= 0) && (i3 < dimX)) && ((j3 >= 0) && (j3 < dimY))) {
-                                normsum += Eucl_Vec[counterG]*pow(Aorig[j3*dimX + (i3)] - Aorig[j2*dimX + (i2)], 2);
-                                counterG++;
-                            }}
-                        
-                    }}
-                /* writing temporarily into vectors */
-                if (normsum > EPS) {                    
-                    Weight_Vec[counter] = expf(-normsum/h2);
-                    ind_i[counter] = i1;
-                    ind_j[counter] = j1;                    
-                    counter++;
-                }
-            }
-        }}     
-    /* do sorting to choose the most prominent weights [HIGH to LOW] */
-    /* and re-arrange indeces accordingly */
-    for (x = 0; x < counter-1; x++)  {
-       for (y = 0; y < counter-x-1; y++)  {
-           if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		
-            swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
-            }
-    	}
-    }
-     /*sorting loop finished*/      
-    /*now select the NumNeighb more prominent weights and store into pre-allocated arrays */ 
-    for(x=0; x < NumNeighb; x++) {
-        index = (dimX*dimY*x) + j*dimX+i;        
-        H_i[index] = ind_i[x];
-        H_j[index] = ind_j[x];
-        Weights[index] = Weight_Vec[x];
-    }    
-    free(ind_i);
-    free(ind_j);
-    free(Weight_Vec);
-    return 1;
-}
-float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2)
-{
-    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, index, sizeWin_tot, counterG;
-    float *Weight_Vec, normsum;
-    unsigned short *ind_i, *ind_j;
-    
+
     sizeWin_tot = (2*SearchWindow + 1)*(2*SearchWindow + 1);
-    
+
     Weight_Vec = (float*) calloc(sizeWin_tot, sizeof(float));
     ind_i = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
     ind_j = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
-    
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -217,11 +133,9 @@ float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float
                         j3 = j + j_c;
                         if (((i2 >= 0) && (i2 < dimX)) && ((j2 >= 0) && (j2 < dimY))) {
                             if (((i3 >= 0) && (i3 < dimX)) && ((j3 >= 0) && (j3 < dimY))) {
-                                //normsum += Eucl_Vec[counterG]*pow(Aorig[j3*dimX + (i3)] - Aorig[j2*dimX + (i2)], 2);
-                                normsum += Eucl_Vec[counterG]*pow(Aorig[i3*dimY + (j3)] - Aorig[i2*dimY + (j2)], 2);
+                                normsum += Eucl_Vec[counterG]*powf(Aorig[j3*dimX + (i3)] - Aorig[j2*dimX + (i2)], 2);
                                 counterG++;
                             }}
-                        
                     }}
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
@@ -232,26 +146,25 @@ float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float
                 }
             }
         }}
-       /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
     }
-    /*sorting loop finished*/
-    
-    /*now select the NumNeighb more prominent weights and store into pre-allocated arrays */ 
+     /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into pre-allocated arrays */
     for(x=0; x < NumNeighb; x++) {
-        index = (dimX*dimY*x) + i*dimY+j;       
+        index = (dimX*dimY*x) + j*dimX+i;
         H_i[index] = ind_i[x];
         H_j[index] = ind_j[x];
         Weights[index] = Weight_Vec[x];
-    }   
+    }
     free(ind_i);
     free(ind_j);
     free(Weight_Vec);
@@ -263,14 +176,14 @@ float Indeces3D(float *Aorig, unsigned short *H_i, unsigned short *H_j, unsigned
     long i1, j1, k1, i_m, j_m, k_m, i_c, j_c, k_c, i2, j2, k2, i3, j3, k3, counter, x, y, index, sizeWin_tot, counterG;
     float *Weight_Vec, normsum, temp;
     unsigned short *ind_i, *ind_j, *ind_k, temp_i, temp_j, temp_k;
-    
+
     sizeWin_tot = (2*SearchWindow + 1)*(2*SearchWindow + 1)*(2*SearchWindow + 1);
-    
+
     Weight_Vec = (float*) calloc(sizeWin_tot, sizeof(float));
     ind_i = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
     ind_j = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
     ind_k = (unsigned short*) calloc(sizeWin_tot, sizeof(unsigned short));
-    
+
     counter = 0l;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -324,22 +237,21 @@ float Indeces3D(float *Aorig, unsigned short *H_i, unsigned short *H_j, unsigned
                 ind_k[y] = temp_k;
             }}}
     /*sorting loop finished*/
-    
+
     /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index = dimX*dimY*dimZ*x + (dimX*dimY*k) + j*dimX+i;
-        
+
         H_i[index] = ind_i[x];
         H_j[index] = ind_j[x];
         H_k[index] = ind_k[x];
-        
+
         Weights[index] = Weight_Vec[x];
     }
-    
+
     free(ind_i);
     free(ind_j);
     free(ind_k);
     free(Weight_Vec);
     return 1;
 }
-
diff --git a/src/Core/regularisers_CPU/PatchSelect_core.h b/src/Core/regularisers_CPU/PatchSelect_core.h
index ddaa428..ea18c26 100644
--- a/src/Core/regularisers_CPU/PatchSelect_core.h
+++ b/src/Core/regularisers_CPU/PatchSelect_core.h
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,9 +54,8 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-CCPI_EXPORT float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h, int switchM);
+CCPI_EXPORT float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h);
 CCPI_EXPORT float Indeces2D(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2);
-CCPI_EXPORT float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2);
 CCPI_EXPORT float Indeces3D(float *Aorig, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, long i, long j, long k, long dimY, long dimX, long dimZ, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2);
 #ifdef __cplusplus
 }
diff --git a/src/Core/regularisers_GPU/PatchSelect_GPU_core.cu b/src/Core/regularisers_GPU/PatchSelect_GPU_core.cu
index 98c8488..2cd27ff 100644
--- a/src/Core/regularisers_GPU/PatchSelect_GPU_core.cu
+++ b/src/Core/regularisers_GPU/PatchSelect_GPU_core.cu
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,35 +51,35 @@
 #define CONSTVECSIZE11 529
 #define CONSTVECSIZE13 729
 
-__device__ void swap(float *xp, float *yp) 
+__device__ void swap(float *xp, float *yp)
 {
-    float temp = *xp; 
-    *xp = *yp; 
-    *yp = temp; 
+    float temp = *xp;
+    *xp = *yp;
+    *yp = temp;
 }
-__device__ void swapUS(unsigned short *xp, unsigned short *yp) 
-{ 
-    unsigned short temp = *xp; 
-    *xp = *yp; 
-    *yp = temp; 
+__device__ void swapUS(unsigned short *xp, unsigned short *yp)
+{
+    unsigned short temp = *xp;
+    *xp = *yp;
+    *yp = temp;
 }
 
 /********************************************************************************/
 __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+{
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
-    
+
     float Weight_Vec[CONSTVECSIZE5];
     unsigned short ind_i[CONSTVECSIZE5];
     unsigned short ind_j[CONSTVECSIZE5];
 
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    long index = i*M+j;      
-    
+
+    long index = i*M+j;
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -95,9 +95,9 @@ __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigne
                         j3 = j + j_c;
                         if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
                             if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
-                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
                                 counterG++;
-                            }}                        
+                            }}
                      }}
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
@@ -108,43 +108,43 @@ __global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigne
                 }
              }
         }}
-        
+
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
-    }     
-    /*sorting loop finished*/        
-    /*now select the NumNeighb more prominent weights and store into arrays */     
+    }
+    /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index2 = (N*M*x) + index;
         H_i_d[index2] = ind_i[x];
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
     }
-} 
+}
 /********************************************************************************/
 __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+{
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
-    
+
     float Weight_Vec[CONSTVECSIZE7];
     unsigned short ind_i[CONSTVECSIZE7];
     unsigned short ind_j[CONSTVECSIZE7];
 
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    long index = i*M+j;      
-    
+
+    long index = i*M+j;
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -160,9 +160,9 @@ __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigne
                         j3 = j + j_c;
                         if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
                             if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
-                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
                                 counterG++;
-                            }}                        
+                            }}
                      }}
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
@@ -173,20 +173,20 @@ __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigne
                 }
              }
         }}
-        
+
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
-    }     
-    /*sorting loop finished*/        
-    /*now select the NumNeighb more prominent weights and store into arrays */     
+    }
+    /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index2 = (N*M*x) + index;
         H_i_d[index2] = ind_i[x];
@@ -195,39 +195,47 @@ __global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigne
     }
 }
 __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+{
 
-    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2, ind;
     float normsum;
-  
+
     float Weight_Vec[CONSTVECSIZE9];
     unsigned short ind_i[CONSTVECSIZE9];
     unsigned short ind_j[CONSTVECSIZE9];
 
+    for(ind=0; ind<CONSTVECSIZE9; ind++) {
+      Weight_Vec[ind] = 0.0;
+      ind_i[ind] = 0;
+      ind_j[ind] = 0; }
+
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    long index = i*M+j;      
-    
+
+    long index = i + N*j;
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+      i1 = i+i_m;
+      if ((i1 >= 0) && (i1 < N)) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
-            i1 = i+i_m;
             j1 = j+j_m;
-            if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+            if ((j1 >= 0) && (j1 < M)) {
                 normsum = 0.0f; counterG = 0;
                 for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                  i2 = i1 + i_c;
+                  i3 = i + i_c;
+                  //if ((i2 >= 0) && (i2 < N) && (i3 >= 0) && (i3 < N)) {
                     for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
-                        i2 = i1 + i_c;
                         j2 = j1 + j_c;
-                        i3 = i + i_c;
                         j3 = j + j_c;
-                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
-                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
-                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
+                        //if ((j2 >= 0) && (j2 < M) && (j3 >= 0) && (j3 < M)) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3 + N*j3] - Ad[i2 + N*j2], 2);
                                 counterG++;
-                            }}                        
-                     }}
+                          //    } /*if j2 j3*/
+                          }
+                    // } /*if i2 i3*/
+                   }
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
                     Weight_Vec[counter] = expf(-normsum/h2);
@@ -235,44 +243,46 @@ __global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigne
                     ind_j[counter] = j1;
                     counter++;
                 }
+              } /*if j1*/
             }
-        }}
-        
+          } /*if i1*/
+        }
+
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
-    }     
-    /*sorting loop finished*/        
-    /*now select the NumNeighb more prominent weights and store into arrays */     
+    }
+    /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index2 = (N*M*x) + index;
         H_i_d[index2] = ind_i[x];
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
-    }                     
+    }
 }
 __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+{
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
-    
+
     float Weight_Vec[CONSTVECSIZE11];
     unsigned short ind_i[CONSTVECSIZE11];
     unsigned short ind_j[CONSTVECSIZE11];
 
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    long index = i*M+j;      
-    
+
+    long index = i*M+j;
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -288,9 +298,9 @@ __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsign
                         j3 = j + j_c;
                         if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
                             if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
-                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
                                 counterG++;
-                            }}                        
+                            }}
                      }}
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
@@ -301,42 +311,42 @@ __global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsign
                 }
            }
         }}
-        
+
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
-    }     
-    /*sorting loop finished*/        
-    /*now select the NumNeighb more prominent weights and store into arrays */     
+    }
+    /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index2 = (N*M*x) + index;
         H_i_d[index2] = ind_i[x];
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
     }
-} 
+}
 __global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
-{          
+{
 
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
     float normsum;
-    
+
     float Weight_Vec[CONSTVECSIZE13];
     unsigned short ind_i[CONSTVECSIZE13];
     unsigned short ind_j[CONSTVECSIZE13];
 
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     int j = blockDim.y * blockIdx.y + threadIdx.y;
-    
-    long index = i*M+j;      
-    
+
+    long index = i*M+j;
+
     counter = 0;
     for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
         for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
@@ -352,9 +362,9 @@ __global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsign
                         j3 = j + j_c;
                         if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
                             if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
-                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
                                 counterG++;
-                            }}                        
+                            }}
                      }}
                 /* writing temporarily into vectors */
                 if (normsum > EPS) {
@@ -365,29 +375,29 @@ __global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsign
                 }
              }
         }}
-        
+
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
     for (x = 0; x < counter-1; x++)  {
        for (y = 0; y < counter-x-1; y++)  {
            if (Weight_Vec[y] < Weight_Vec[y+1]) {
-            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]);
             swapUS(&ind_i[y], &ind_i[y+1]);
-            swapUS(&ind_j[y], &ind_j[y+1]);  
+            swapUS(&ind_j[y], &ind_j[y+1]);
             }
     	}
-    }     
-    /*sorting loop finished*/        
-    /*now select the NumNeighb more prominent weights and store into arrays */     
+    }
+    /*sorting loop finished*/
+    /*now select the NumNeighb more prominent weights and store into arrays */
     for(x=0; x < NumNeighb; x++) {
         index2 = (N*M*x) + index;
         H_i_d[index2] = ind_i[x];
         H_j_d[index2] = ind_j[x];
         Weights_d[index2] = Weight_Vec[x];
     }
-} 
+}
+
 
-   
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 /********************* MAIN HOST FUNCTION ******************/
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
@@ -398,19 +408,19 @@ extern "C" int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned shor
     if (deviceCount == 0) {
         fprintf(stderr, "No CUDA devices found\n");
         return -1;
-    }  
-      
+    }
+
     int SearchW_full, SimilW_full, counterG, i, j;
-    float *Ad, *Weights_d, h2, *Eucl_Vec, *Eucl_Vec_d;    
+    float *Ad, *Weights_d, h2, *Eucl_Vec, *Eucl_Vec_d;
     unsigned short *H_i_d, *H_j_d;
     h2 = h*h;
-    
+
     dim3 dimBlock(BLKXSIZE,BLKYSIZE);
-    dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));    
-       
+    dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));
+
     SearchW_full = (2*SearchWindow + 1)*(2*SearchWindow + 1); /* the full searching window  size */
     SimilW_full = (2*SimilarWin + 1)*(2*SimilarWin + 1);   /* the full similarity window  size */
-    
+
     /* generate a 2D Gaussian kernel for NLM procedure */
     Eucl_Vec = (float*) calloc (SimilW_full,sizeof(float));
     counterG = 0;
@@ -419,8 +429,8 @@ extern "C" int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned shor
               Eucl_Vec[counterG] = (float)exp(-(pow(((float) i), 2) + pow(((float) j), 2))/(2.0*SimilarWin*SimilarWin));
               counterG++;
     }} /*main neighb loop */
-    
-    
+
+
     /*allocate space on the device*/
     checkCudaErrors( cudaMalloc((void**)&Ad, N*M*sizeof(float)) );
     checkCudaErrors( cudaMalloc((void**)&H_i_d, N*M*NumNeighb*sizeof(unsigned short)) );
@@ -430,8 +440,8 @@ extern "C" int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned shor
 
     /* copy data from the host to the device */
     checkCudaErrors( cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice) );
-    checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );    
- 
+    checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );
+
     /********************** Run CUDA kernel here ********************/
     if (SearchWindow == 5)  IndexSelect2D_5_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
     else if (SearchWindow == 7)  IndexSelect2D_7_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
@@ -440,19 +450,19 @@ extern "C" int PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned shor
     else if (SearchWindow == 13)  IndexSelect2D_13_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
     else {
     fprintf(stderr, "Select the searching window size from 5, 7, 9, 11 or 13\n");
-        return -1;}    
-    checkCudaErrors(cudaPeekAtLastError() );        
-    checkCudaErrors(cudaDeviceSynchronize());   
-    /***************************************************************/    
-        
+        return -1;}
+    checkCudaErrors(cudaPeekAtLastError() );
+    checkCudaErrors(cudaDeviceSynchronize());
+    /***************************************************************/
+
     checkCudaErrors(cudaMemcpy(H_i, H_i_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );
-    checkCudaErrors(cudaMemcpy(H_j, H_j_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );  
-    checkCudaErrors(cudaMemcpy(Weights, Weights_d, N*M*NumNeighb*sizeof(float),cudaMemcpyDeviceToHost) );   
-    
-    
-    cudaFree(Ad); 
-    cudaFree(H_i_d); 
-    cudaFree(H_j_d);    
+    checkCudaErrors(cudaMemcpy(H_j, H_j_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );
+    checkCudaErrors(cudaMemcpy(Weights, Weights_d, N*M*NumNeighb*sizeof(float),cudaMemcpyDeviceToHost) );
+
+
+    cudaFree(Ad);
+    cudaFree(H_i_d);
+    cudaFree(H_j_d);
     cudaFree(Weights_d);
     cudaFree(Eucl_Vec_d);
     cudaDeviceReset();
diff --git a/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c b/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c
index 34b9915..a5fb1df 100644
--- a/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c
+++ b/src/Matlab/mex_compile/regularisers_CPU/Nonlocal_TV.c
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,12 +35,12 @@
  * 2. AR_i - indeces of i neighbours
  * 3. AR_j - indeces of j neighbours
  * 4. AR_k - indeces of k neighbours (0 - for 2D case)
- * 5. Weights_ij(k) - associated weights 
+ * 5. Weights_ij(k) - associated weights
  * 6. regularisation parameter
- * 7. iterations number 
- 
+ * 7. iterations number
+
  * Output:
- * 1. denoised image/volume 	
+ * 1. denoised image/volume
  * Elmoataz, Abderrahim, Olivier Lezoray, and Sébastien Bougleux. "Nonlocal discrete regularization on weighted graphs: a framework for image and manifold processing." IEEE Trans. Image Processing 17, no. 7 (2008): 1047-1060.
  */
 
@@ -54,11 +54,11 @@ void mexFunction(
     const mwSize  *dim_array;
     const mwSize *dim_array2;
     float *A_orig, *Output=NULL, *Weights, lambda;
-    
+
     dim_array = mxGetDimensions(prhs[0]);
     dim_array2 = mxGetDimensions(prhs[1]);
     number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-    
+
     /*Handling Matlab input data*/
     A_orig  = (float *) mxGetData(prhs[0]); /* a 2D image or a set of 2D images (3D stack) */
     H_i  = (unsigned short *) mxGetData(prhs[1]); /* indeces of i neighbours */
@@ -67,14 +67,14 @@ void mexFunction(
     Weights = (float *) mxGetData(prhs[4]); /* weights for patches */
     lambda = (float) mxGetScalar(prhs[5]); /* regularisation parameter */
     IterNumb = (int) mxGetScalar(prhs[6]); /* the number of iterations */
- 
-    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];   
-         
+
+    dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
+
     /*****2D INPUT *****/
     if (number_of_dims == 2) {
-        dimZ = 0;   
+        dimZ = 0;
         NumNeighb = dim_array2[2];
-        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));  
+        Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
         }
     /*****3D INPUT *****/
     /****************************************************/
@@ -82,7 +82,7 @@ void mexFunction(
         NumNeighb = dim_array2[3];
         Output = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
     }
-    
+
     /* run the main function here */
-    Nonlocal_TV_CPU_main(A_orig, Output, H_i, H_j, H_k, Weights, dimX, dimY, dimZ, NumNeighb, lambda, IterNumb);
+    Nonlocal_TV_CPU_main(A_orig, Output, H_i, H_j, H_k, Weights, dimX, dimY, dimZ, NumNeighb, lambda, IterNumb, 0);
 }
diff --git a/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c b/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c
index 1acab29..84b08dd 100644
--- a/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c
+++ b/src/Matlab/mex_compile/regularisers_CPU/PatchSelect.c
@@ -1,11 +1,11 @@
 /*
  * This work is part of the Core Imaging Library developed by
  * Visual Analytics and Imaging System Group of the Science Technology
- * Facilities Council, STFC and Diamond Light Source Ltd. 
+ * Facilities Council, STFC and Diamond Light Source Ltd.
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * Copyright 2018 Diamond Light Source Ltd. 
+ * Copyright 2018 Diamond Light Source Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,14 +53,14 @@ void mexFunction(
     int number_of_dims,  SearchWindow, SimilarWin, NumNeighb;
     mwSize dimX, dimY, dimZ;
     const mwSize *dim_array;
-    unsigned short *H_i=NULL, *H_j=NULL, *H_k=NULL;    
+    unsigned short *H_i=NULL, *H_j=NULL, *H_k=NULL;
     float *A, *Weights = NULL, h;
     mwSize dim_array2[3]; /* for 2D data */
     mwSize dim_array3[4]; /* for 3D data */
-    
+
     dim_array = mxGetDimensions(prhs[0]);
     number_of_dims = mxGetNumberOfDimensions(prhs[0]);
-    
+
     /*Handling Matlab input data*/
     A  = (float *) mxGetData(prhs[0]); /* a 2D or 3D image/volume */
     SearchWindow = (int) mxGetScalar(prhs[1]);    /* Large Searching window */
@@ -71,22 +71,22 @@ void mexFunction(
     dimX = dim_array[0]; dimY = dim_array[1]; dimZ = dim_array[2];
     dim_array2[0] = dimX; dim_array2[1] = dimY; dim_array2[2] = NumNeighb;  /* 2D case */
     dim_array3[0] = dimX; dim_array3[1] = dimY; dim_array3[2] = dimZ; dim_array3[3] = NumNeighb;  /* 3D case */
-    
+
     /****************2D INPUT ***************/
     if (number_of_dims == 2) {
-        dimZ = 0;               
+        dimZ = 0;
         H_i = (unsigned short*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array2, mxUINT16_CLASS, mxREAL));
         H_j = (unsigned short*)mxGetPr(plhs[1] = mxCreateNumericArray(3, dim_array2, mxUINT16_CLASS, mxREAL));
         Weights = (float*)mxGetPr(plhs[2] = mxCreateNumericArray(3, dim_array2, mxSINGLE_CLASS, mxREAL));
         }
     /****************3D INPUT ***************/
-    if (number_of_dims == 3) {        
+    if (number_of_dims == 3) {
         H_i = (unsigned short*)mxGetPr(plhs[0] = mxCreateNumericArray(4, dim_array3, mxUINT16_CLASS, mxREAL));
         H_j = (unsigned short*)mxGetPr(plhs[1] = mxCreateNumericArray(4, dim_array3, mxUINT16_CLASS, mxREAL));
         H_k = (unsigned short*)mxGetPr(plhs[2] = mxCreateNumericArray(4, dim_array3, mxUINT16_CLASS, mxREAL));
-        Weights = (float*)mxGetPr(plhs[3] = mxCreateNumericArray(4, dim_array3, mxSINGLE_CLASS, mxREAL));        
+        Weights = (float*)mxGetPr(plhs[3] = mxCreateNumericArray(4, dim_array3, mxSINGLE_CLASS, mxREAL));
     }
-    
-    PatchSelect_CPU_main(A, H_i, H_j, H_k, Weights, (long)(dimX), (long)(dimY), (long)(dimZ), SearchWindow, SimilarWin, NumNeighb, h, 0); 
-    
+
+    PatchSelect_CPU_main(A, H_i, H_j, H_k, Weights, (long)(dimX), (long)(dimY), (long)(dimZ), SearchWindow, SimilarWin, NumNeighb, h); 
+
  }
diff --git a/src/Python/src/cpu_regularisers.pyx b/src/Python/src/cpu_regularisers.pyx
index 904b4f5..4917d06 100644
--- a/src/Python/src/cpu_regularisers.pyx
+++ b/src/Python/src/cpu_regularisers.pyx
@@ -27,8 +27,8 @@ cdef extern float Diffusion_CPU_main(float *Input, float *Output, float *infovec
 cdef extern float Diffus4th_CPU_main(float *Input, float *Output,  float *infovector, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ);
 cdef extern float dTV_FGP_CPU_main(float *Input, float *InputRef, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int dimX, int dimY, int dimZ);
 cdef extern float TNV_CPU_main(float *Input, float *u, float lambdaPar, int maxIter, float tol, int dimX, int dimY, int dimZ);
-cdef extern float PatchSelect_CPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h, int switchM);
-cdef extern float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb);
+cdef extern float PatchSelect_CPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h);
+cdef extern float Nonlocal_TV_CPU_main(float *A_orig, float *Output, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int NumNeighb, float lambdaReg, int IterNumb, int switchM);
 
 cdef extern float Diffusion_Inpaint_CPU_main(float *Input, unsigned char *Mask, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ);
 cdef extern float NonlocalMarching_Inpaint_main(float *Input, unsigned char *M, float *Output, unsigned char *M_upd, int SW_increment, int iterationsNumb, int trigger, int dimX, int dimY, int dimZ);
@@ -570,7 +570,7 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0], dims[1],dims[2]], dtype='uint16')
 
     # Run patch-based weight selection function
-    PatchSelect_CPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], 0, searchwindow, patchwindow,  neighbours,  edge_parameter, 1)
+    PatchSelect_CPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], 0, searchwindow, patchwindow,  neighbours,  edge_parameter)
     return H_i, H_j, Weights
 """
 def PatchSel_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
@@ -625,7 +625,7 @@ def NLTV_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
             np.zeros([dims[0],dims[1]], dtype='float32')
 
     # Run nonlocal TV regularisation
-    Nonlocal_TV_CPU_main(&inputData[0,0], &outputData[0,0], &H_i[0,0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[1], dims[0], 0, neighbours, regularisation_parameter, iterations)
+    Nonlocal_TV_CPU_main(&inputData[0,0], &outputData[0,0], &H_i[0,0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[1], dims[0], 0, neighbours, regularisation_parameter, iterations, 1)
     return outputData
 
 #*********************Inpainting WITH****************************#
-- 
cgit v1.2.3