Merge remote-tracking branch 'origin/master'

author: TomasKulhanek <tomas.kulhanek@stfc.ac.uk> 2018-12-06 14:56:39 +0000
committer: TomasKulhanek <tomas.kulhanek@stfc.ac.uk> 2018-12-06 14:56:39 +0000
commit: c04b85a6fdd8c63e3363c8072cbfe4b97409dc60 (patch)
tree: 0b12e391692854435319ee72199cfa57908bcff8
parent: c4d1fccb8fb4a2beaf9b9cfb8e2f38b9cfeac283 (diff)
parent: 3bce1f1410303a6833d1e647fba9692ea40fa878 (diff)
download: regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.gz
regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.bz2
regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.xz
regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.zip
15 files changed, 722 insertions, 71 deletions
diff --git a/Core/CMakeLists.txt b/Core/CMakeLists.txt
index df01bb7..d92db82 100644
--- a/Core/CMakeLists.txt
+++ b/Core/CMakeLists.txt
@@ -120,6 +120,7 @@ if (BUILD_CUDA)
         ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu
         ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu
         ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu
+        ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu
       )
       if (UNIX)
         message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib")
diff --git a/Core/regularisers_CPU/PatchSelect_core.c b/Core/regularisers_CPU/PatchSelect_core.c
index 45cf8c8..cf5cdc7 100644
--- a/Core/regularisers_CPU/PatchSelect_core.c
+++ b/Core/regularisers_CPU/PatchSelect_core.c
@@ -44,6 +44,19 @@
  * 4. Weights_ijk - associated weights
  */
 
+void swap(float *xp, float *yp) 
+{ 
+    float temp = *xp; 
+    *xp = *yp; 
+    *yp = temp; 
+} 
+
+void swapUS(unsigned short *xp, unsigned short *yp) 
+{ 
+    unsigned short temp = *xp; 
+    *xp = *yp; 
+    *yp = temp; 
+} 
 /**************************************************/
 
 float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, unsigned short *H_k, float *Weights, int dimX, int dimY, int dimZ, int SearchWindow, int SimilarWin, int NumNeighb, float h, int switchM)
@@ -51,8 +64,7 @@ float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, u
     int counterG;
     long i, j, k;
     float *Eucl_Vec, h2;
-    h2 = h*h;
-   
+    h2 = h*h;   
     /****************2D INPUT ***************/
     if (dimZ == 0) {
         /* generate a 2D Gaussian kernel for NLM procedure */
@@ -67,14 +79,14 @@ float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, u
         if (switchM == 1) {
 #pragma omp parallel for shared (A, Weights, H_i, H_j) private(i,j)
     for(i=0; i<(long)(dimX); i++) {
-            for(j=0; j<(long)(dimY); j++) {
+          for(j=0; j<(long)(dimY); j++) {
                 Indeces2D_p(A, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
             }}
         }
         else {
 #pragma omp parallel for shared (A, Weights, H_i, H_j) private(i,j)
     for(i=0; i<(long)(dimX); i++) {
-            for(j=0; j<(long)(dimY); j++) {
+          for(j=0; j<(long)(dimY); j++) {
                 Indeces2D(A, H_i, H_j, Weights, i, j, (long)(dimX), (long)(dimY), Eucl_Vec, NumNeighb, SearchWindow, SimilarWin, h2);
             }}
             }
@@ -116,8 +128,8 @@ float PatchSelect_CPU_main(float *A, unsigned short *H_i, unsigned short *H_j, u
 float Indeces2D(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2)
 {
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, index, sizeWin_tot, counterG;
-    float *Weight_Vec, normsum, temp;
-    unsigned short *ind_i, *ind_j, temp_i, temp_j;
+    float *Weight_Vec, normsum;
+    unsigned short *ind_i, *ind_j;
     
     sizeWin_tot = (2*SearchWindow + 1)*(2*SearchWindow + 1);
     
@@ -146,51 +158,43 @@ float Indeces2D(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *W
                         
                     }}
                 /* writing temporarily into vectors */
-                if (normsum > EPS) {
+                if (normsum > EPS) {                    
                     Weight_Vec[counter] = expf(-normsum/h2);
                     ind_i[counter] = i1;
-                    ind_j[counter] = j1;
+                    ind_j[counter] = j1;                    
                     counter++;
                 }
             }
-        }}
+        }}     
     /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
-    for (x = 0; x < counter; x++)  {
-        for (y = 0; y < counter; y++)  {
-            if (Weight_Vec[y] < Weight_Vec[x]) {
-                temp = Weight_Vec[y+1];
-                temp_i = ind_i[y+1];
-                temp_j = ind_j[y+1];
-                Weight_Vec[y+1] = Weight_Vec[y];
-                Weight_Vec[y] = temp;
-                ind_i[y+1] = ind_i[y];
-                ind_i[y] = temp_i;
-                ind_j[y+1] = ind_j[y];
-                ind_j[y] = temp_j;
-            }}}
-    /*sorting loop finished*/
-    
-    /*now select the NumNeighb more prominent weights and store into arrays */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }
+     /*sorting loop finished*/      
+    /*now select the NumNeighb more prominent weights and store into pre-allocated arrays */ 
     for(x=0; x < NumNeighb; x++) {
-        index = (dimX*dimY*x) + j*dimX+i;
+        index = (dimX*dimY*x) + j*dimX+i;        
         H_i[index] = ind_i[x];
         H_j[index] = ind_j[x];
         Weights[index] = Weight_Vec[x];
-    }
-    
+    }    
     free(ind_i);
     free(ind_j);
     free(Weight_Vec);
     return 1;
 }
-
-
 float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float *Weights, long i, long j, long dimX, long dimY, float *Eucl_Vec, int NumNeighb, int SearchWindow, int SimilarWin, float h2)
 {
     long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, index, sizeWin_tot, counterG;
-    float *Weight_Vec, normsum, temp;
-    unsigned short *ind_i, *ind_j, temp_i, temp_j;
+    float *Weight_Vec, normsum;
+    unsigned short *ind_i, *ind_j;
     
     sizeWin_tot = (2*SearchWindow + 1)*(2*SearchWindow + 1);
     
@@ -228,32 +232,26 @@ float Indeces2D_p(float *Aorig, unsigned short *H_i, unsigned short *H_j, float
                 }
             }
         }}
-    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+       /* do sorting to choose the most prominent weights [HIGH to LOW] */
     /* and re-arrange indeces accordingly */
-    for (x = 0; x < counter; x++)  {
-        for (y = 0; y < counter; y++)  {
-            if (Weight_Vec[y] < Weight_Vec[x]) {
-                temp = Weight_Vec[y+1];
-                temp_i = ind_i[y+1];
-                temp_j = ind_j[y+1];
-                Weight_Vec[y+1] = Weight_Vec[y];
-                Weight_Vec[y] = temp;
-                ind_i[y+1] = ind_i[y];
-                ind_i[y] = temp_i;
-                ind_j[y+1] = ind_j[y];
-                ind_j[y] = temp_j;
-            }}}
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }
     /*sorting loop finished*/
     
-    /*now select the NumNeighb more prominent weights and store into arrays */
+    /*now select the NumNeighb more prominent weights and store into pre-allocated arrays */ 
     for(x=0; x < NumNeighb; x++) {
-        //index = (dimX*dimY*x) + j*dimX+i;
-        index = (dimX*dimY*x) + i*dimY+j;
+        index = (dimX*dimY*x) + i*dimY+j;       
         H_i[index] = ind_i[x];
         H_j[index] = ind_j[x];
         Weights[index] = Weight_Vec[x];
-    }
-    
+    }   
     free(ind_i);
     free(ind_j);
     free(Weight_Vec);
diff --git a/Core/regularisers_GPU/PatchSelect_GPU_core.cu b/Core/regularisers_GPU/PatchSelect_GPU_core.cu
new file mode 100644
index 0000000..f558b0f
--- /dev/null
+++ b/Core/regularisers_GPU/PatchSelect_GPU_core.cu
@@ -0,0 +1,468 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC and Diamond Light Source Ltd. 
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ * Copyright 2018 Diamond Light Source Ltd. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PatchSelect_GPU_core.h"
+
+/* CUDA implementation of non-local weight pre-calculation for non-local priors
+ * Weights and associated indices are stored into pre-allocated arrays and passed
+ * to the regulariser
+ *
+ *
+ * Input Parameters:
+ * 1. 2D grayscale image (classical 3D version will not be supported but rather 2D + dim extension (TODO))
+ * 2. Searching window (half-size of the main bigger searching window, e.g. 11)
+ * 3. Similarity window (half-size of the patch window, e.g. 2)
+ * 4. The number of neighbours to take (the most prominent after sorting neighbours will be taken)
+ * 5. noise-related parameter to calculate non-local weights
+ *
+ * Output [2D]:
+ * 1. AR_i - indeces of i neighbours
+ * 2. AR_j - indeces of j neighbours
+ * 3. Weights_ij - associated weights
+ */
+
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#define checkCudaErrors(err)           __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+    if (cudaSuccess != err)
+    {
+        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+                file, line, (int)err, cudaGetErrorString(err));
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define BLKXSIZE 16
+#define BLKYSIZE 16
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+#define M_PI 3.14159265358979323846
+#define EPS 1.0e-8
+#define CONSTVECSIZE5 121
+#define CONSTVECSIZE7 225
+#define CONSTVECSIZE9 361
+#define CONSTVECSIZE11 529
+#define CONSTVECSIZE13 729
+
+__device__ void swap(float *xp, float *yp) 
+{
+    float temp = *xp; 
+    *xp = *yp; 
+    *yp = temp; 
+}
+__device__ void swapUS(unsigned short *xp, unsigned short *yp) 
+{ 
+    unsigned short temp = *xp; 
+    *xp = *yp; 
+    *yp = temp; 
+}
+
+/********************************************************************************/
+__global__ void IndexSelect2D_5_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
+
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    float normsum;
+    
+    float Weight_Vec[CONSTVECSIZE5];
+    unsigned short ind_i[CONSTVECSIZE5];
+    unsigned short ind_j[CONSTVECSIZE5];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    long index = i*M+j;      
+    
+    counter = 0;
+    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
+            i1 = i+i_m;
+            j1 = j+j_m;
+              if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+                normsum = 0.0f; counterG = 0;
+                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
+                        i2 = i1 + i_c;
+                        j2 = j1 + j_c;
+                        i3 = i + i_c;
+                        j3 = j + j_c;
+                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
+                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                counterG++;
+                            }}                        
+                     }}
+                /* writing temporarily into vectors */
+                if (normsum > EPS) {
+                    Weight_Vec[counter] = __expf(-normsum/h2);
+                    ind_i[counter] = i1;
+                    ind_j[counter] = j1;
+                    counter++;
+                }
+             }
+        }}
+        
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* and re-arrange indeces accordingly */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }     
+    /*sorting loop finished*/        
+    /*now select the NumNeighb more prominent weights and store into arrays */     
+    for(x=0; x < NumNeighb; x++) {
+        index2 = (N*M*x) + index;
+        H_i_d[index2] = ind_i[x];
+        H_j_d[index2] = ind_j[x];
+        Weights_d[index2] = Weight_Vec[x];
+    }
+} 
+/********************************************************************************/
+__global__ void IndexSelect2D_7_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
+
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    float normsum;
+    
+    float Weight_Vec[CONSTVECSIZE7];
+    unsigned short ind_i[CONSTVECSIZE7];
+    unsigned short ind_j[CONSTVECSIZE7];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    long index = i*M+j;      
+    
+    counter = 0;
+    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
+            i1 = i+i_m;
+            j1 = j+j_m;
+             if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+                normsum = 0.0f; counterG = 0;
+                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
+                        i2 = i1 + i_c;
+                        j2 = j1 + j_c;
+                        i3 = i + i_c;
+                        j3 = j + j_c;
+                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
+                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                counterG++;
+                            }}                        
+                     }}
+                /* writing temporarily into vectors */
+                if (normsum > EPS) {
+                    Weight_Vec[counter] = __expf(-normsum/h2);
+                    ind_i[counter] = i1;
+                    ind_j[counter] = j1;
+                    counter++;
+                }
+             }
+        }}
+        
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* and re-arrange indeces accordingly */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }     
+    /*sorting loop finished*/        
+    /*now select the NumNeighb more prominent weights and store into arrays */     
+    for(x=0; x < NumNeighb; x++) {
+        index2 = (N*M*x) + index;
+        H_i_d[index2] = ind_i[x];
+        H_j_d[index2] = ind_j[x];
+        Weights_d[index2] = Weight_Vec[x];
+    }
+}
+__global__ void IndexSelect2D_9_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
+
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    float normsum;
+  
+    float Weight_Vec[CONSTVECSIZE9];
+    unsigned short ind_i[CONSTVECSIZE9];
+    unsigned short ind_j[CONSTVECSIZE9];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    long index = i*M+j;      
+    
+    counter = 0;
+    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
+            i1 = i+i_m;
+            j1 = j+j_m;
+            if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+                normsum = 0.0f; counterG = 0;
+                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
+                        i2 = i1 + i_c;
+                        j2 = j1 + j_c;
+                        i3 = i + i_c;
+                        j3 = j + j_c;
+                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
+                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);
+                                counterG++;
+                            }}                        
+                     }}
+                /* writing temporarily into vectors */
+                if (normsum > EPS) {
+                    Weight_Vec[counter] = expf(-normsum/h2);
+                    ind_i[counter] = i1;
+                    ind_j[counter] = j1;
+                    counter++;
+                }
+            }
+        }}
+        
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* and re-arrange indeces accordingly */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }     
+    /*sorting loop finished*/        
+    /*now select the NumNeighb more prominent weights and store into arrays */     
+    for(x=0; x < NumNeighb; x++) {
+        index2 = (N*M*x) + index;
+        H_i_d[index2] = ind_i[x];
+        H_j_d[index2] = ind_j[x];
+        Weights_d[index2] = Weight_Vec[x];
+    }                     
+}
+__global__ void IndexSelect2D_11_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
+
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    float normsum;
+    
+    float Weight_Vec[CONSTVECSIZE11];
+    unsigned short ind_i[CONSTVECSIZE11];
+    unsigned short ind_j[CONSTVECSIZE11];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    long index = i*M+j;      
+    
+    counter = 0;
+    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
+            i1 = i+i_m;
+            j1 = j+j_m;
+            if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+                normsum = 0.0f; counterG = 0;
+                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
+                        i2 = i1 + i_c;
+                        j2 = j1 + j_c;
+                        i3 = i + i_c;
+                        j3 = j + j_c;
+                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
+                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                counterG++;
+                            }}                        
+                     }}
+                /* writing temporarily into vectors */
+                if (normsum > EPS) {
+                    Weight_Vec[counter] = __expf(-normsum/h2);
+                    ind_i[counter] = i1;
+                    ind_j[counter] = j1;
+                    counter++;
+                }
+           }
+        }}
+        
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* and re-arrange indeces accordingly */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }     
+    /*sorting loop finished*/        
+    /*now select the NumNeighb more prominent weights and store into arrays */     
+    for(x=0; x < NumNeighb; x++) {
+        index2 = (N*M*x) + index;
+        H_i_d[index2] = ind_i[x];
+        H_j_d[index2] = ind_j[x];
+        Weights_d[index2] = Weight_Vec[x];
+    }
+} 
+__global__ void IndexSelect2D_13_kernel(float *Ad, unsigned short *H_i_d, unsigned short *H_j_d, float *Weights_d, float *Eucl_Vec_d, int N, int M, int SearchWindow, int SearchW_full, int SimilarWin, int NumNeighb, float h2)
+{          
+
+    long i1, j1, i_m, j_m, i_c, j_c, i2, j2, i3, j3, counter, x, y, counterG, index2;
+    float normsum;
+    
+    float Weight_Vec[CONSTVECSIZE13];
+    unsigned short ind_i[CONSTVECSIZE13];
+    unsigned short ind_j[CONSTVECSIZE13];
+
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int j = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    long index = i*M+j;      
+    
+    counter = 0;
+    for(i_m=-SearchWindow; i_m<=SearchWindow; i_m++) {
+        for(j_m=-SearchWindow; j_m<=SearchWindow; j_m++) {
+            i1 = i+i_m;
+            j1 = j+j_m;
+            if (((i1 >= 0) && (i1 < N)) && ((j1 >= 0) && (j1 < M))) {
+                normsum = 0.0f; counterG = 0;
+                for(i_c=-SimilarWin; i_c<=SimilarWin; i_c++) {
+                    for(j_c=-SimilarWin; j_c<=SimilarWin; j_c++) {
+                        i2 = i1 + i_c;
+                        j2 = j1 + j_c;
+                        i3 = i + i_c;
+                        j3 = j + j_c;
+                        if (((i2 >= 0) && (i2 < N)) && ((j2 >= 0) && (j2 < M))) {
+                            if (((i3 >= 0) && (i3 < N)) && ((j3 >= 0) && (j3 < M))) {
+                                normsum += Eucl_Vec_d[counterG]*powf(Ad[i3*M + j3] - Ad[i2*M + j2], 2);                                
+                                counterG++;
+                            }}                        
+                     }}
+                /* writing temporarily into vectors */
+                if (normsum > EPS) {
+                    Weight_Vec[counter] = __expf(-normsum/h2);
+                    ind_i[counter] = i1;
+                    ind_j[counter] = j1;
+                    counter++;
+                }
+             }
+        }}
+        
+    /* do sorting to choose the most prominent weights [HIGH to LOW] */
+    /* and re-arrange indeces accordingly */
+    for (x = 0; x < counter-1; x++)  {
+       for (y = 0; y < counter-x-1; y++)  {
+           if (Weight_Vec[y] < Weight_Vec[y+1]) {
+            swap(&Weight_Vec[y], &Weight_Vec[y+1]); 		                       
+            swapUS(&ind_i[y], &ind_i[y+1]);
+            swapUS(&ind_j[y], &ind_j[y+1]);  
+            }
+    	}
+    }     
+    /*sorting loop finished*/        
+    /*now select the NumNeighb more prominent weights and store into arrays */     
+    for(x=0; x < NumNeighb; x++) {
+        index2 = (N*M*x) + index;
+        H_i_d[index2] = ind_i[x];
+        H_j_d[index2] = ind_j[x];
+        Weights_d[index2] = Weight_Vec[x];
+    }
+} 
+
+   
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+/********************* MAIN HOST FUNCTION ******************/
+/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
+extern "C" void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h)
+{
+    int deviceCount = -1; // number of devices
+    cudaGetDeviceCount(&deviceCount);
+    if (deviceCount == 0) {
+        fprintf(stderr, "No CUDA devices found\n");
+        return;
+    }  
+      
+    int SearchW_full, SimilW_full, counterG, i, j;
+    float *Ad, *Weights_d, h2, *Eucl_Vec, *Eucl_Vec_d;    
+    unsigned short *H_i_d, *H_j_d;
+    h2 = h*h;
+    
+    dim3 dimBlock(BLKXSIZE,BLKYSIZE);
+    dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));    
+       
+    SearchW_full = (2*SearchWindow + 1)*(2*SearchWindow + 1); /* the full searching window  size */
+    SimilW_full = (2*SimilarWin + 1)*(2*SimilarWin + 1);   /* the full similarity window  size */
+    
+    /* generate a 2D Gaussian kernel for NLM procedure */
+    Eucl_Vec = (float*) calloc (SimilW_full,sizeof(float));
+    counterG = 0;
+    for(i=-SimilarWin; i<=SimilarWin; i++) {
+         for(j=-SimilarWin; j<=SimilarWin; j++) {
+              Eucl_Vec[counterG] = (float)exp(-(pow(((float) i), 2) + pow(((float) j), 2))/(2.0*SimilarWin*SimilarWin));
+              counterG++;
+    }} /*main neighb loop */
+    
+    
+    /*allocate space on the device*/
+    checkCudaErrors( cudaMalloc((void**)&Ad, N*M*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&H_i_d, N*M*NumNeighb*sizeof(unsigned short)) );
+    checkCudaErrors( cudaMalloc((void**)&H_j_d, N*M*NumNeighb*sizeof(unsigned short)) );
+    checkCudaErrors( cudaMalloc((void**)&Weights_d, N*M*NumNeighb*sizeof(float)) );
+    checkCudaErrors( cudaMalloc((void**)&Eucl_Vec_d, SimilW_full*sizeof(float)) );
+
+    /* copy data from the host to the device */
+    checkCudaErrors( cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice) );
+    checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );    
+ 
+    /********************** Run CUDA kernel here ********************/
+    if (SearchWindow == 5)  IndexSelect2D_5_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
+    else if (SearchWindow == 7)  IndexSelect2D_7_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
+    else if (SearchWindow == 9)  IndexSelect2D_9_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
+    else if (SearchWindow == 11)  IndexSelect2D_11_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
+    else if (SearchWindow == 13)  IndexSelect2D_13_kernel<<<dimGrid,dimBlock>>>(Ad, H_i_d, H_j_d, Weights_d, Eucl_Vec_d, N, M, SearchWindow, SearchW_full, SimilarWin, NumNeighb, h2);
+    else {
+    fprintf(stderr, "Select the searching window size from 5, 7, 9, 11 or 13\n");
+        return;}    
+    checkCudaErrors(cudaPeekAtLastError() );        
+    checkCudaErrors(cudaDeviceSynchronize());   
+    /***************************************************************/    
+        
+    checkCudaErrors(cudaMemcpy(H_i, H_i_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );
+    checkCudaErrors(cudaMemcpy(H_j, H_j_d, N*M*NumNeighb*sizeof(unsigned short),cudaMemcpyDeviceToHost) );  
+    checkCudaErrors(cudaMemcpy(Weights, Weights_d, N*M*NumNeighb*sizeof(float),cudaMemcpyDeviceToHost) );   
+    
+    cudaFree(Ad); 
+    cudaFree(H_i_d); 
+    cudaFree(H_j_d);    
+    cudaFree(Weights_d);    
+    cudaFree(Eucl_Vec_d);
+}
diff --git a/Core/regularisers_GPU/PatchSelect_GPU_core.h b/Core/regularisers_GPU/PatchSelect_GPU_core.h
new file mode 100644
index 0000000..d20fe9f
--- /dev/null
+++ b/Core/regularisers_GPU/PatchSelect_GPU_core.h
@@ -0,0 +1,8 @@
+#ifndef __NLREG_KERNELS_H_
+#define __NLREG_KERNELS_H_
+#include "CCPiDefines.h"
+#include <stdio.h>
+
+extern "C" CCPI_EXPORT void PatchSelect_GPU_main(float *A, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
+
+#endif 
diff --git a/Readme.md b/Readme.md
index cc74dbc..089c9fe 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,6 +1,6 @@
 # CCPi-Regularisation Toolkit (CCPi-RGL)
 
-**Iterative image reconstruction (IIR) methods normally require regularisation to stabilise the convergence and make the reconstruction problem more well-posed. CCPi-RGL software consists of 2D/3D regularisation modules for single-channel and multi-channel reconstruction problems. The regularisation modules are well-suited to use with [splitting algorithms](https://en.wikipedia.org/wiki/Augmented_Lagrangian_method#Alternating_direction_method_of_multipliers), such as ADMM and FISTA. Furthermore, the toolkit can be used independently to solve image denoising and inpaiting tasks. The core modules are written in C-OMP and CUDA languages, wrappers for Matlab and Python are provided.** 
+**Iterative image reconstruction (IIR) methods normally require regularisation to stabilise the convergence and make the reconstruction problem (inverse problem) more well-posed. The CCPi-RGL software provides 2D/3D and multi-channel regularisation strategies to ensure better performance of IIR methods. The regularisation modules are well-suited to use with [splitting algorithms](https://en.wikipedia.org/wiki/Augmented_Lagrangian_method#Alternating_direction_method_of_multipliers), such as, [ADMM](https://github.com/dkazanc/ADMM-tomo) and [FISTA](https://github.com/dkazanc/FISTA-tomo). Furthermore, the toolkit can be used for simpler inversion tasks, such as, image denoising, inpaiting, deconvolution etc. The core modules are written in C-OMP and CUDA languages and wrappers for Matlab and Python are provided.** 
 
 <div align="center">
   <img src="docs/images/probl.png" height="225"><br>  
@@ -10,6 +10,10 @@
   <img src="docs/images/reg_penalties.jpg" height="450"><br>  
 </div>
 
+<div align="center">
+  <img src="docs/images/TV_vs_NLTV.jpg" height="300"><br>  
+</div>
+
 ## Prerequisites: 
 
  * [MATLAB](www.mathworks.com/products/matlab/) OR
@@ -27,7 +31,7 @@
 5. Linear and nonlinear diffusion (explicit PDE minimisation scheme) **2D/3D CPU/GPU** (Ref. *8*)
 6. Anisotropic Fourth-Order Diffusion (explicit PDE minimisation) **2D/3D CPU/GPU** (Ref. *9*)
 7. A joint ROF-LLT (Lysaker-Lundervold-Tai) model for higher-order regularisation **2D/3D CPU/GPU** (Ref. *10,11*)
-8. Nonlocal Total Variation regularisation (GS fixed point iteration) **2D/3D CPU/GPU** (Ref. *12*)
+8. Nonlocal Total Variation regularisation (GS fixed point iteration) **2D CPU/GPU** (Ref. *12*)
 
 ### Multi-channel (denoising):
 1. Fast-Gradient-Projection (FGP) Directional Total Variation **2D/3D CPU/GPU** (Ref. *3,4,2*)
@@ -158,7 +162,7 @@ addpath(/path/to/library);
 
 ### Applications:
 
-* [Regularised FISTA iterative reconstruction algorithm for X-ray tomographic reconstruction with highly inaccurate measurements (MATLAB code)](https://github.com/dkazanc/FISTA-tomo)
+* [Regularised FISTA iterative reconstruction algorithm for X-ray tomographic reconstruction with highly inaccurate measurements (MATLAB/Python code)](https://github.com/dkazanc/FISTA-tomo)
 * [Regularised ADMM iterative reconstruction algorithm for X-ray tomographic reconstruction (MATLAB code)](https://github.com/dkazanc/ADMM-tomo)
 * [Joint image reconstruction method with correlative multi-channel prior for X-ray spectral computed tomography (MATLAB code)](https://github.com/dkazanc/multi-channel-X-ray-CT)
 
diff --git a/Wrappers/Matlab/demos/demoMatlab_denoise.m b/Wrappers/Matlab/demos/demoMatlab_denoise.m
index 54b8bac..3506cca 100644
--- a/Wrappers/Matlab/demos/demoMatlab_denoise.m
+++ b/Wrappers/Matlab/demos/demoMatlab_denoise.m
@@ -138,17 +138,17 @@ figure; imshow(u_diff4, [0 1]); title('Diffusion 4thO denoised image (CPU)');
 fprintf('Weights pre-calculation for Non-local TV (takes time on CPU) \n');
 SearchingWindow = 7;
 PatchWindow = 2;
-NeighboursNumber = 15; % the number of neibours to include
+NeighboursNumber = 20; % the number of neibours to include
 h = 0.23; % edge related parameter for NLM
-[H_i, H_j, Weights] = PatchSelect(single(u0), SearchingWindow, PatchWindow, NeighboursNumber, h);
+tic; [H_i, H_j, Weights] = PatchSelect(single(u0), SearchingWindow, PatchWindow, NeighboursNumber, h); toc;
 %%
 fprintf('Denoise using Non-local Total Variation (CPU) \n');
-iter_nltv = 2; % number of nltv iterations
-lambda_nltv = 0.085; % regularisation parameter for nltv
+iter_nltv = 3; % number of nltv iterations
+lambda_nltv = 0.05; % regularisation parameter for nltv
 tic; u_nltv = Nonlocal_TV(single(u0), H_i, H_j, 0, Weights, lambda_nltv, iter_nltv); toc; 
 rmse_nltv = (RMSE(u_nltv(:),Im(:)));
 fprintf('%s %f \n', 'RMSE error for Non-local Total Variation is:', rmse_nltv);
-figure; imshow(u_nltv, [0 1]); title('Non-local Total Variation denoised image (CPU)');
+figure; imagesc(u_nltv, [0 1]); colormap(gray); daspect([1 1 1]); title('Non-local Total Variation denoised image (CPU)');
 %%
 %>>>>>>>>>>>>>> MULTI-CHANNEL priors <<<<<<<<<<<<<<< %
 
diff --git a/Wrappers/Python/ccpi/filters/regularisers.py b/Wrappers/Python/ccpi/filters/regularisers.py
index bf7e23c..0a65590 100644
--- a/Wrappers/Python/ccpi/filters/regularisers.py
+++ b/Wrappers/Python/ccpi/filters/regularisers.py
@@ -4,7 +4,7 @@ script which assigns a proper device core function based on a flag ('cpu' or 'gp
 
 from ccpi.filters.cpu_regularisers import TV_ROF_CPU, TV_FGP_CPU, TV_SB_CPU, dTV_FGP_CPU, TNV_CPU, NDF_CPU, Diff4th_CPU, TGV_CPU, LLT_ROF_CPU, PATCHSEL_CPU, NLTV_CPU
 try:
-    from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU
+    from ccpi.filters.gpu_regularisers import TV_ROF_GPU, TV_FGP_GPU, TV_SB_GPU, dTV_FGP_GPU, NDF_GPU, Diff4th_GPU, TGV_GPU, LLT_ROF_GPU, PATCHSEL_GPU
     gpu_enabled = True
 except ImportError:
     gpu_enabled = False    
@@ -153,7 +153,11 @@ def PatchSelect(inputData, searchwindow, patchwindow, neighbours, edge_parameter
                      neighbours, 
                      edge_parameter)
     elif device == 'gpu' and gpu_enabled:
-        return 1
+        return PATCHSEL_GPU(inputData,
+                     searchwindow,
+                     patchwindow,
+                     neighbours, 
+                     edge_parameter)
     else:
         if not gpu_enabled and device == 'gpu':
     	    raise ValueError ('GPU is not available')
diff --git a/Wrappers/Python/conda-recipe/run_test.py b/Wrappers/Python/conda-recipe/run_test.py
index 6ffaca1..499ae7f 100755
--- a/Wrappers/Python/conda-recipe/run_test.py
+++ b/Wrappers/Python/conda-recipe/run_test.py
@@ -2,7 +2,7 @@ import unittest
 import numpy as np
 import os
 import timeit
-from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
+from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
 from PIL import Image
 
 class TiffReader(object):
diff --git a/Wrappers/Python/demos/demo_cpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_regularisers.py
index 31e4cad..78e9aff 100644
--- a/Wrappers/Python/demos/demo_cpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_regularisers.py
@@ -400,20 +400,29 @@ plt.title('{}'.format('CPU results'))
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("___Nonlocal patches pre-calculation____")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+start_time = timeit.default_timer()
 # set parameters
 pars = {'algorithm' : PatchSelect, \
         'input' : u0,\
         'searchwindow': 7, \
         'patchwindow': 2,\
         'neighbours' : 15 ,\
-        'edge_parameter':0.23}
+        'edge_parameter':0.18}
 
 H_i, H_j, Weights = PatchSelect(pars['input'], 
               pars['searchwindow'],
               pars['patchwindow'], 
               pars['neighbours'],
               pars['edge_parameter'],'cpu')
-
+              
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+"""
+plt.figure()
+plt.imshow(Weights[0,:,:],cmap="gray",interpolation="nearest",vmin=0, vmax=1)
+plt.show()
+"""
 #%%
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("___Nonlocal Total Variation penalty____")
@@ -431,10 +440,9 @@ pars2 = {'algorithm' : NLTV, \
         'H_j': H_j,\
         'H_k' : 0,\
         'Weights' : Weights,\
-        'regularisation_parameter': 0.085,\
-        'iterations': 2
+        'regularisation_parameter': 0.04,\
+        'iterations': 3
         }
-#%%
 start_time = timeit.default_timer()
 nltv_cpu = NLTV(pars2['input'], 
               pars2['H_i'],
diff --git a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
index 3d6e92f..616eab0 100644
--- a/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_cpu_vs_gpu_regularisers.py
@@ -13,6 +13,7 @@ import numpy as np
 import os
 import timeit
 from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
+from ccpi.filters.regularisers import PatchSelect
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -732,4 +733,58 @@ if (diff_im.sum() > 1):
     print ("Arrays do not match!")
 else:
     print ("Arrays match")
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("____Non-local regularisation bench_________")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+
+## plot 
+fig = plt.figure()
+plt.suptitle('Comparison of Nonlocal TV regulariser using CPU and GPU implementations')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+pars = {'algorithm' : PatchSelect, \
+        'input' : u0,\
+        'searchwindow': 7, \
+        'patchwindow': 2,\
+        'neighbours' : 15 ,\
+        'edge_parameter':0.18}
+
+print ("############## Nonlocal Patches on CPU##################")
+start_time = timeit.default_timer()
+H_i, H_j, WeightsCPU = PatchSelect(pars['input'], 
+              pars['searchwindow'],
+              pars['patchwindow'], 
+              pars['neighbours'],
+              pars['edge_parameter'],'cpu')
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+
+print ("############## Nonlocal Patches on GPU##################")
+start_time = timeit.default_timer()
+start_time = timeit.default_timer()
+H_i, H_j, WeightsGPU = PatchSelect(pars['input'], 
+              pars['searchwindow'],
+              pars['patchwindow'], 
+              pars['neighbours'],
+              pars['edge_parameter'],'gpu')
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+
+print ("--------Compare the results--------")
+tolerance = 1e-05
+diff_im = np.zeros(np.shape(u0))
+diff_im = abs(WeightsCPU[0,:,:] - WeightsGPU[0,:,:])
+diff_im[diff_im > tolerance] = 1
+a=fig.add_subplot(1,2,2)
+imgplot = plt.imshow(diff_im, vmin=0, vmax=1, cmap="gray")
+plt.title('{}'.format('Pixels larger threshold difference'))
+if (diff_im.sum() > 1):
+    print ("Arrays do not match!")
+else:
+    print ("Arrays match")
 #%%
 \ No newline at end of file
diff --git a/Wrappers/Python/demos/demo_gpu_regularisers.py b/Wrappers/Python/demos/demo_gpu_regularisers.py
index de0cbde..2ada559 100644
--- a/Wrappers/Python/demos/demo_gpu_regularisers.py
+++ b/Wrappers/Python/demos/demo_gpu_regularisers.py
@@ -13,6 +13,7 @@ import numpy as np
 import os
 import timeit
 from ccpi.filters.regularisers import ROF_TV, FGP_TV, SB_TV, TGV, LLT_ROF, FGP_dTV, NDF, DIFF4th
+from ccpi.filters.regularisers import PatchSelect, NLTV
 from qualitymetrics import rmse
 ###############################################################################
 def printParametersToString(pars):
@@ -84,7 +85,7 @@ pars = {'algorithm': ROF_TV, \
         'input' : u0,\
         'regularisation_parameter':0.04,\
         'number_of_iterations': 1200,\
-        'time_marching_parameter': 0.0025        
+        'time_marching_parameter': 0.0025
         }
 print ("##############ROF TV GPU##################")
 start_time = timeit.default_timer()
@@ -394,6 +395,77 @@ plt.title('{}'.format('GPU results'))
 
 #%%
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("___Nonlocal patches pre-calculation____")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+start_time = timeit.default_timer()
+# set parameters
+pars = {'algorithm' : PatchSelect, \
+        'input' : u0,\
+        'searchwindow': 7, \
+        'patchwindow': 2,\
+        'neighbours' : 15 ,\
+        'edge_parameter':0.18}
+
+H_i, H_j, Weights = PatchSelect(pars['input'], 
+              pars['searchwindow'],
+              pars['patchwindow'], 
+              pars['neighbours'],
+              pars['edge_parameter'],'gpu')
+              
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+"""
+plt.figure()
+plt.imshow(Weights[0,:,:],cmap="gray",interpolation="nearest",vmin=0, vmax=1)
+plt.show()
+"""
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+print ("___Nonlocal Total Variation penalty____")
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+## plot 
+fig = plt.figure()
+plt.suptitle('Performance of NLTV regulariser using the CPU')
+a=fig.add_subplot(1,2,1)
+a.set_title('Noisy Image')
+imgplot = plt.imshow(u0,cmap="gray")
+
+pars2 = {'algorithm' : NLTV, \
+        'input' : u0,\
+        'H_i': H_i, \
+        'H_j': H_j,\
+        'H_k' : 0,\
+        'Weights' : Weights,\
+        'regularisation_parameter': 0.02,\
+        'iterations': 3
+        }
+start_time = timeit.default_timer()
+nltv_cpu = NLTV(pars2['input'], 
+              pars2['H_i'],
+              pars2['H_j'], 
+              pars2['H_k'],
+              pars2['Weights'],
+              pars2['regularisation_parameter'],
+              pars2['iterations'])
+
+rms = rmse(Im, nltv_cpu)
+pars['rmse'] = rms
+
+txtstr = printParametersToString(pars)
+txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time)
+print (txtstr)
+a=fig.add_subplot(1,2,2)
+
+# these are matplotlib.patch.Patch properties
+props = dict(boxstyle='round', facecolor='wheat', alpha=0.75)
+# place a text box in upper left in axes coords
+a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
+         verticalalignment='top', bbox=props)
+imgplot = plt.imshow(nltv_cpu, cmap="gray")
+plt.title('{}'.format('CPU results'))
+#%%
+print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("____________FGP-dTV bench___________________")
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 
diff --git a/Wrappers/Python/setup-regularisers.py.in b/Wrappers/Python/setup-regularisers.py.in
index 542dcb4..462edda 100644
--- a/Wrappers/Python/setup-regularisers.py.in
+++ b/Wrappers/Python/setup-regularisers.py.in
@@ -45,6 +45,7 @@ extra_include_dirs += [os.path.join(".." , ".." , "Core"),
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "NDF" ) ,
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "dTV_FGP" ) , 
                        os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "DIFF4th" ) , 
+                       os.path.join(".." , ".." , "Core",  "regularisers_GPU" , "PatchSelect" ) ,
 						   "."]
 
 if platform.system() == 'Windows':				   
diff --git a/Wrappers/Python/src/cpu_regularisers.pyx b/Wrappers/Python/src/cpu_regularisers.pyx
index e51e6d8..4aa3251 100644
--- a/Wrappers/Python/src/cpu_regularisers.pyx
+++ b/Wrappers/Python/src/cpu_regularisers.pyx
@@ -456,7 +456,7 @@ def PATCHSEL_CPU(inputData, searchwindow, patchwindow, neighbours, edge_paramete
     if inputData.ndim == 2:
         return PatchSel_2D(inputData, searchwindow, patchwindow, neighbours, edge_parameter)
     elif inputData.ndim == 3:
-        return PatchSel_3D(inputData, searchwindow, patchwindow, neighbours, edge_parameter)
+        return 1
 def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                      int searchwindow,
                      int patchwindow,
@@ -480,7 +480,7 @@ def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
     # Run patch-based weight selection function
     PatchSelect_CPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], 0, searchwindow, patchwindow,  neighbours,  edge_parameter, 1)
     return H_i, H_j, Weights
-
+"""
 def PatchSel_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
                      int searchwindow,
                      int patchwindow,
@@ -507,7 +507,7 @@ def PatchSel_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     # Run patch-based weight selection function
     PatchSelect_CPU_main(&inputData[0,0,0], &H_i[0,0,0,0], &H_j[0,0,0,0], &H_k[0,0,0,0], &Weights[0,0,0,0], dims[2], dims[1], dims[0], searchwindow, patchwindow,  neighbours, edge_parameter, 1)
     return H_i, H_j, H_k, Weights
-
+"""
 
 #****************************************************************#
 #***************Non-local Total Variation******************#
diff --git a/Wrappers/Python/src/gpu_regularisers.pyx b/Wrappers/Python/src/gpu_regularisers.pyx
index 82d3e01..302727e 100644
--- a/Wrappers/Python/src/gpu_regularisers.pyx
+++ b/Wrappers/Python/src/gpu_regularisers.pyx
@@ -26,6 +26,7 @@ cdef extern void LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF,
 cdef extern void NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z);
 cdef extern void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z);
 cdef extern void Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z);
+cdef extern void PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned short *H_j, float *Weights, int N, int M, int SearchWindow, int SimilarWin, int NumNeighb, float h);
 
 # Total-variation Rudin-Osher-Fatemi (ROF)
 def TV_ROF_GPU(inputData,
@@ -542,3 +543,34 @@ def Diff4th_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
     Diffus4th_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, edge_parameter, iterationsNumb, time_marching_parameter, dims[2], dims[1], dims[0])
 
     return outputData
+#****************************************************************#
+#************Patch-based weights pre-selection******************#
+#****************************************************************#
+def PATCHSEL_GPU(inputData, searchwindow, patchwindow, neighbours, edge_parameter):
+    if inputData.ndim == 2:
+        return PatchSel_2D(inputData, searchwindow, patchwindow, neighbours, edge_parameter)
+    elif inputData.ndim == 3:
+        return 1
+def PatchSel_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
+                     int searchwindow,
+                     int patchwindow,
+                     int neighbours,
+                     float edge_parameter):
+    cdef long dims[3]
+    dims[0] = neighbours
+    dims[1] = inputData.shape[0]
+    dims[2] = inputData.shape[1]    
+    
+    cdef np.ndarray[np.float32_t, ndim=3, mode="c"] Weights = \
+            np.zeros([dims[0], dims[1],dims[2]], dtype='float32')
+    
+    cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_i = \
+            np.zeros([dims[0], dims[1],dims[2]], dtype='uint16')
+            
+    cdef np.ndarray[np.uint16_t, ndim=3, mode="c"] H_j = \
+            np.zeros([dims[0], dims[1],dims[2]], dtype='uint16')
+
+    # Run patch-based weight selection function
+    PatchSelect_GPU_main(&inputData[0,0], &H_j[0,0,0], &H_i[0,0,0], &Weights[0,0,0], dims[2], dims[1], searchwindow, patchwindow,  neighbours,  edge_parameter)
+    
+    return H_i, H_j, Weights      
diff --git a/docs/images/TV_vs_NLTV.jpg b/docs/images/TV_vs_NLTV.jpg
new file mode 100644
index 0000000..e976512
--- /dev/null
+++ b/docs/images/TV_vs_NLTV.jpg
author	TomasKulhanek <tomas.kulhanek@stfc.ac.uk>	2018-12-06 14:56:39 +0000
committer	TomasKulhanek <tomas.kulhanek@stfc.ac.uk>	2018-12-06 14:56:39 +0000
commit	c04b85a6fdd8c63e3363c8072cbfe4b97409dc60 (patch)
tree	0b12e391692854435319ee72199cfa57908bcff8
parent	c4d1fccb8fb4a2beaf9b9cfb8e2f38b9cfeac283 (diff)
parent	3bce1f1410303a6833d1e647fba9692ea40fa878 (diff)
download	regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.gz regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.bz2 regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.tar.xz regularization-c04b85a6fdd8c63e3363c8072cbfe4b97409dc60.zip