summaryrefslogtreecommitdiffstats
path: root/main_func
diff options
context:
space:
mode:
Diffstat (limited to 'main_func')
-rw-r--r--main_func/FISTA_REC.m573
-rw-r--r--main_func/compile_mex.m11
-rw-r--r--main_func/regularizers_CPU/FGP_TV.c47
-rw-r--r--main_func/regularizers_CPU/FGP_TV_core.c75
-rw-r--r--main_func/regularizers_CPU/FGP_TV_core.h2
-rw-r--r--main_func/regularizers_CPU/LLT_model.c1
-rw-r--r--main_func/regularizers_CPU/LLT_model_core.c25
-rw-r--r--main_func/regularizers_CPU/LLT_model_core.h25
-rw-r--r--main_func/regularizers_CPU/PatchBased_Regul.c43
-rw-r--r--main_func/regularizers_CPU/PatchBased_Regul_core.c57
-rw-r--r--main_func/regularizers_CPU/SplitBregman_TV_core.c1
-rw-r--r--main_func/regularizers_CPU/TGV_PD.c72
-rw-r--r--main_func/regularizers_CPU/TGV_PD_core.c1
-rw-r--r--main_func/regularizers_CPU/TGV_PD_core.h3
-rw-r--r--main_func/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp114
-rw-r--r--main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu270
-rw-r--r--main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.h6
-rw-r--r--main_func/regularizers_GPU/NL_Regul/NLM_GPU.cpp171
-rw-r--r--main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu239
-rw-r--r--main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.h6
20 files changed, 1479 insertions, 263 deletions
diff --git a/main_func/FISTA_REC.m b/main_func/FISTA_REC.m
index 2823691..6987dca 100644
--- a/main_func/FISTA_REC.m
+++ b/main_func/FISTA_REC.m
@@ -1,24 +1,48 @@
function [X, output] = FISTA_REC(params)
-% <<<< FISTA-based reconstruction algorithm using ASTRA-toolbox >>>>
+% <<<< FISTA-based reconstruction routine using ASTRA-toolbox >>>>
+% This code solves regularised PWLS problem using FISTA approach.
+% The code contains multiple regularisation penalties as well as it can be
+% accelerated by using ordered-subset version. Various projection
+% geometries supported.
+
+% DISCLAIMER
+% It is recommended to use ASTRA version 1.8 or later in order to avoid
+% crashing due to GPU memory overflow for big datasets
+
% ___Input___:
% params.[] file:
+%----------------General Parameters------------------------
% - .proj_geom (geometry of the projector) [required]
% - .vol_geom (geometry of the reconstructed object) [required]
% - .sino (vectorized in 2D or 3D sinogram) [required]
% - .iterFISTA (iterations for the main loop, default 40)
% - .L_const (Lipschitz constant, default Power method) )
% - .X_ideal (ideal image, if given)
-% - .weights (statisitcal weights, size of the sinogram)
+% - .weights (statisitcal weights for the PWLS model, size of the sinogram)
+% - .fidelity (use 'studentt' fidelity)
% - .ROI (Region-of-interest, only if X_ideal is given)
% - .initialize (a 'warm start' using SIRT method from ASTRA)
%----------------Regularization choices------------------------
-% - .Regul_Lambda_FGPTV (FGP-TV regularization parameter)
-% - .Regul_Lambda_SBTV (SplitBregman-TV regularization parameter)
-% - .Regul_Lambda_TVLLT (Higher order SB-LLT regularization parameter)
+% 1 .Regul_Lambda_FGPTV (FGP-TV regularization parameter)
+% 2 .Regul_Lambda_SBTV (SplitBregman-TV regularization parameter)
+% 3 .Regul_LambdaLLT (Higher order LLT regularization parameter)
+% 3.1 .Regul_tauLLT (time step parameter for LLT (HO) term)
+% 4 .Regul_LambdaPatchBased_CPU (Patch-based nonlocal regularization parameter)
+% 4.1 .Regul_PB_SearchW (ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window))
+% 4.2 .Regul_PB_SimilW (ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window))
+% 4.3 .Regul_PB_h (PB penalty function threshold)
+% 5 .Regul_LambdaPatchBased_GPU (Patch-based nonlocal regularization parameter)
+% 5.1 .Regul_PB_SearchW (ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window))
+% 5.2 .Regul_PB_SimilW (ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window))
+% 5.3 .Regul_PB_h (PB penalty function threshold)
+% 6 .Regul_LambdaDiffHO (Higher-Order Diffusion regularization parameter)
+% 6.1 .Regul_DiffHO_EdgePar (edge-preserving noise related parameter)
+% 7 .Regul_LambdaTGV (Total Generalized variation regularization parameter)
% - .Regul_tol (tolerance to terminate regul iterations, default 1.0e-04)
% - .Regul_Iterations (iterations for the selected penalty, default 25)
-% - .Regul_tauLLT (time step parameter for LLT term)
+% - .Regul_Dimension ('2D' or '3D' way to apply regularization, '3D' is the default)
+%----------------Ring removal------------------------
% - .Ring_LambdaR_L1 (regularization parameter for L1-ring minimization, if lambdaR_L1 > 0 then switch on ring removal)
% - .Ring_Alpha (larger values can accelerate convergence but check stability, default 1)
%----------------Visualization parameters------------------------
@@ -69,6 +93,15 @@ if (isfield(params,'weights'))
else
weights = ones(size(sino));
end
+if (isfield(params,'fidelity'))
+ studentt = 0;
+ if (strcmp(params.fidelity,'studentt') == 1)
+ studentt = 1;
+ lambdaR_L1 = 0;
+ end
+else
+ studentt = 0;
+end
if (isfield(params,'L_const'))
L_const = params.L_const;
else
@@ -90,16 +123,16 @@ else
for i = 1:niter
[id,x1] = astra_create_backprojection3d_cuda(sqweight.*y, proj_geomT, vol_geomT);
s = norm(x1(:));
- x1 = x1/s;
+ x1 = x1./s;
[sino_id, y] = astra_create_sino3d_cuda(x1, proj_geomT, vol_geomT);
y = sqweight.*y;
astra_mex_data3d('delete', sino_id);
astra_mex_data3d('delete', id);
end
- clear proj_geomT vol_geomT
+ %clear proj_geomT vol_geomT
else
% divergen beam geometry
- fprintf('%s \n', 'Calculating Lipshitz constant for divergen beam geometry...');
+ fprintf('%s \n', 'Calculating Lipshitz constant for divergen beam geometry... will take some time!');
niter = 8; % number of iteration for PM
x1 = rand(N,N,SlicesZ);
sqweight = sqrt(weights);
@@ -150,8 +183,8 @@ if (isfield(params,'Regul_Iterations'))
else
IterationsRegul = 25;
end
-if (isfield(params,'Regul_LambdaHO'))
- lambdaHO = params.Regul_LambdaHO;
+if (isfield(params,'Regul_LambdaLLT'))
+ lambdaHO = params.Regul_LambdaLLT;
else
lambdaHO = 0;
end
@@ -165,6 +198,46 @@ if (isfield(params,'Regul_tauLLT'))
else
tauHO = 0.0001;
end
+if (isfield(params,'Regul_LambdaPatchBased_CPU'))
+ lambdaPB = params.Regul_LambdaPatchBased_CPU;
+else
+ lambdaPB = 0;
+end
+if (isfield(params,'Regul_LambdaPatchBased_GPU'))
+ lambdaPB_GPU = params.Regul_LambdaPatchBased_GPU;
+else
+ lambdaPB_GPU = 0;
+end
+if (isfield(params,'Regul_PB_SearchW'))
+ SearchW = params.Regul_PB_SearchW;
+else
+ SearchW = 3; % default
+end
+if (isfield(params,'Regul_PB_SimilW'))
+ SimilW = params.Regul_PB_SimilW;
+else
+ SimilW = 1; % default
+end
+if (isfield(params,'Regul_PB_h'))
+ h_PB = params.Regul_PB_h;
+else
+ h_PB = 0.1; % default
+end
+if (isfield(params,'Regul_LambdaDiffHO'))
+ LambdaDiff_HO = params.Regul_LambdaDiffHO;
+else
+ LambdaDiff_HO = 0;
+end
+if (isfield(params,'Regul_DiffHO_EdgePar'))
+ LambdaDiff_HO_EdgePar = params.Regul_DiffHO_EdgePar;
+else
+ LambdaDiff_HO_EdgePar = 0.01;
+end
+if (isfield(params,'Regul_LambdaTGV'))
+ LambdaTGV = params.Regul_LambdaTGV;
+else
+ LambdaTGV = 0;
+end
if (isfield(params,'Ring_LambdaR_L1'))
lambdaR_L1 = params.Ring_LambdaR_L1;
else
@@ -175,6 +248,14 @@ if (isfield(params,'Ring_Alpha'))
else
alpha_ring = 1;
end
+if (isfield(params,'Regul_Dimension'))
+ Dimension = params.Regul_Dimension;
+ if ((strcmp('2D', Dimension) ~= 1) && (strcmp('3D', Dimension) ~= 1))
+ Dimension = '3D';
+ end
+else
+ Dimension = '3D';
+end
if (isfield(params,'show'))
show = params.show;
else
@@ -216,94 +297,422 @@ if (isfield(params,'initialize'))
else
X = zeros(N,N,SlicesZ, 'single'); % storage for the solution
end
+if (isfield(params,'subsets'))
+ % Ordered Subsets reorganisation of data and angles
+ subsets = params.subsets; % subsets number
+ angles = proj_geom.ProjectionAngles;
+ binEdges = linspace(min(angles),max(angles),subsets+1);
+
+ % assign values to bins
+ [binsDiscr,~] = histc(angles, [binEdges(1:end-1) Inf]);
+
+ % get rearranged subset indices
+ IndicesReorg = zeros(length(angles),1);
+ counterM = 0;
+ for ii = 1:max(binsDiscr(:))
+ counter = 0;
+ for jj = 1:subsets
+ curr_index = ii+jj-1 + counter;
+ if (binsDiscr(jj) >= ii)
+ counterM = counterM + 1;
+ IndicesReorg(counterM) = curr_index;
+ end
+ counter = (counter + binsDiscr(jj)) - 1;
+ end
+ end
+else
+ subsets = 0; % Classical FISTA
+end
%----------------Reconstruction part------------------------
Resid_error = zeros(iterFISTA,1); % errors vector (if the ground truth is given)
objective = zeros(iterFISTA,1); % objective function values vector
-t = 1;
-X_t = X;
-r = zeros(Detectors,SlicesZ, 'single'); % 2D array (for 3D data) of sparse "ring" vectors
-r_x = r; % another ring variable
-residual = zeros(size(sino),'single');
-
-% Outer FISTA iterations loop
-for i = 1:iterFISTA
-
- X_old = X;
- t_old = t;
- r_old = r;
+if (subsets == 0)
+ % Classical FISTA
+ t = 1;
+ X_t = X;
- [sino_id, sino_updt] = astra_create_sino3d_cuda(X_t, proj_geom, vol_geom);
+ r = zeros(Detectors,SlicesZ, 'single'); % 2D array (for 3D data) of sparse "ring" vectors
+ r_x = r; % another ring variable
+ residual = zeros(size(sino),'single');
- if (lambdaR_L1 > 0)
- % ring removal part (Group-Huber fidelity)
- for kkk = 1:anglesNumb
- residual(:,kkk,:) = squeeze(weights(:,kkk,:)).*(squeeze(sino_updt(:,kkk,:)) - (squeeze(sino(:,kkk,:)) - alpha_ring.*r_x));
- end
- vec = sum(residual,2);
- if (SlicesZ > 1)
- vec = squeeze(vec(:,1,:));
+ % Outer FISTA iterations loop
+ for i = 1:iterFISTA
+
+ X_old = X;
+ t_old = t;
+ r_old = r;
+
+ % if the geometry is parallel use slice-by-slice projection-backprojection routine
+ if (strcmp(proj_geom.type,'parallel') || strcmp(proj_geom.type,'parallel3d'))
+ sino_updt = zeros(size(sino),'single');
+ for kkk = 1:SlicesZ
+ [sino_id, sino_updt(:,:,kkk)] = astra_create_sino3d_cuda(X_t(:,:,kkk), proj_geomT, vol_geomT);
+ astra_mex_data3d('delete', sino_id);
+ end
+ else
+ % for divergent 3D geometry (watch the GPU memory overflow in earlier ASTRA versions < 1.8)
+ [sino_id, sino_updt] = astra_create_sino3d_cuda(X_t, proj_geom, vol_geom);
+ end
+
+ if (lambdaR_L1 > 0)
+ % the ring removal part (Group-Huber fidelity)
+ for kkk = 1:anglesNumb
+ residual(:,kkk,:) = squeeze(weights(:,kkk,:)).*(squeeze(sino_updt(:,kkk,:)) - (squeeze(sino(:,kkk,:)) - alpha_ring.*r_x));
+ end
+ vec = sum(residual,2);
+ if (SlicesZ > 1)
+ vec = squeeze(vec(:,1,:));
+ end
+ r = r_x - (1./L_const).*vec;
+ objective(i) = (0.5*sum(residual(:).^2)); % for the objective function output
+ else
+ if (studentt == 1)
+ % artifacts removal with Students t penalty
+ residual = weights.*(sino_updt - sino);
+ for kkk = 1:SlicesZ
+ res_vec = reshape(residual(:,:,kkk), Detectors*anglesNumb, 1); % 1D vectorized sinogram
+ %s = 100;
+ %gr = (2)*res_vec./(s*2 + conj(res_vec).*res_vec);
+ [ff, gr] = studentst(res_vec, 1);
+ residual(:,:,kkk) = reshape(gr, Detectors, anglesNumb);
+ end
+ objective(i) = ff; % for the objective function output
+ else
+ % no ring removal (LS model)
+ residual = weights.*(sino_updt - sino);
+ objective(i) = (0.5*sum(residual(:).^2)); % for the objective function output
+ end
+ end
+
+ % if the geometry is parallel use slice-by-slice projection-backprojection routine
+ if (strcmp(proj_geom.type,'parallel') || strcmp(proj_geom.type,'parallel3d'))
+ x_temp = zeros(size(X),'single');
+ for kkk = 1:SlicesZ
+ [id, x_temp(:,:,kkk)] = astra_create_backprojection3d_cuda(squeeze(residual(:,:,kkk)), proj_geomT, vol_geomT);
+ astra_mex_data3d('delete', id);
+ end
+ else
+ [id, x_temp] = astra_create_backprojection3d_cuda(residual, proj_geom, vol_geom);
+ end
+ X = X_t - (1/L_const).*x_temp;
+ astra_mex_data3d('delete', sino_id);
+ astra_mex_data3d('delete', id);
+
+ % regularization
+ if (lambdaFGP_TV > 0)
+ % FGP-TV regularization
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ [X(:,:,kkk), f_val] = FGP_TV(single(X(:,:,kkk)), lambdaFGP_TV, IterationsRegul, tol, 'iso');
+ end
+ else
+ % 3D regularization
+ [X, f_val] = FGP_TV(single(X), lambdaFGP_TV, IterationsRegul, tol, 'iso');
+ end
+ objective(i) = (objective(i) + f_val)./(Detectors*anglesNumb*SlicesZ);
+ end
+ if (lambdaSB_TV > 0)
+ % Split Bregman regularization
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = SplitBregman_TV(single(X(:,:,kkk)), lambdaSB_TV, IterationsRegul, tol); % (more memory efficent)
+ end
+ else
+ % 3D regularization
+ X = SplitBregman_TV(single(X), lambdaSB_TV, IterationsRegul, tol); % (more memory efficent)
+ end
+ end
+ if (lambdaHO > 0)
+ % Higher Order (LLT) regularization
+ X2 = zeros(N,N,SlicesZ,'single');
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X2(:,:,kkk) = LLT_model(single(X(:,:,kkk)), lambdaHO, tauHO, iterHO, 3.0e-05, 0);
+ end
+ else
+ % 3D regularization
+ X2 = LLT_model(single(X), lambdaHO, tauHO, iterHO, 3.0e-05, 0);
+ end
+ X = 0.5.*(X + X2); % averaged combination of two solutions
+
+ end
+ if (lambdaPB > 0)
+ % Patch-Based regularization (can be very slow on CPU)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = PatchBased_Regul(single(X(:,:,kkk)), SearchW, SimilW, h_PB, lambdaPB);
+ end
+ else
+ X = PatchBased_Regul(single(X), SearchW, SimilW, h_PB, lambdaPB);
+ end
+ end
+ if (lambdaPB_GPU > 0)
+ % Patch-Based regularization (GPU CUDA implementation)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = NLM_GPU(single(X(:,:,kkk)), SearchW, SimilW, h_PB, lambdaPB_GPU);
+ end
+ else
+ X = NLM_GPU(single(X), SearchW, SimilW, h_PB, lambdaPB_GPU);
+ end
+ end
+ if (LambdaDiff_HO > 0)
+ % Higher-order diffusion penalty (GPU CUDA implementation)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = Diff4thHajiaboli_GPU(single(X(:,:,kkk)), LambdaDiff_HO_EdgePar, LambdaDiff_HO, IterationsRegul);
+ end
+ else
+ X = Diff4thHajiaboli_GPU(X, LambdaDiff_HO_EdgePar, LambdaDiff_HO, IterationsRegul);
+ end
+ end
+ if (LambdaTGV > 0)
+ % Total Generalized variation (currently only 2D)
+ lamTGV1 = 1.1; % smoothing trade-off parameters, see Pock's paper
+ lamTGV2 = 0.8; % second-order term
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = TGV_PD(single(X(:,:,kkk)), LambdaTGV, lamTGV1, lamTGV2, IterationsRegul);
+ end
+ end
+
+ if (lambdaR_L1 > 0)
+ r = max(abs(r)-lambdaR_L1, 0).*sign(r); % soft-thresholding operator for ring vector
+ end
+
+ t = (1 + sqrt(1 + 4*t^2))/2; % updating t
+ X_t = X + ((t_old-1)/t).*(X - X_old); % updating X
+
+ if (lambdaR_L1 > 0)
+ r_x = r + ((t_old-1)/t).*(r - r_old); % updating r
+ end
+
+ if (show == 1)
+ figure(10); imshow(X(:,:,slice), [0 maxvalplot]);
+ if (lambdaR_L1 > 0)
+ figure(11); plot(r); title('Rings offset vector')
+ end
+ pause(0.01);
+ end
+ if (strcmp(X_ideal, 'none' ) == 0)
+ Resid_error(i) = RMSE(X(ROI), X_ideal(ROI));
+ fprintf('%s %i %s %s %.4f %s %s %f \n', 'Iteration Number:', i, '|', 'Error RMSE:', Resid_error(i), '|', 'Objective:', objective(i));
+ else
+ fprintf('%s %i %s %s %f \n', 'Iteration Number:', i, '|', 'Objective:', objective(i));
end
- r = r_x - (1./L_const).*vec;
- else
- % no ring removal
- residual = weights.*(sino_updt - sino);
- end
-
- objective(i) = (0.5*norm(residual(:))^2)/(Detectors*anglesNumb*SlicesZ); % for the objective function output
-
- [id, x_temp] = astra_create_backprojection3d_cuda(residual, proj_geom, vol_geom);
-
- X = X_t - (1/L_const).*x_temp;
- astra_mex_data3d('delete', sino_id);
- astra_mex_data3d('delete', id);
-
- if (lambdaFGP_TV > 0)
- % FGP-TV regularization
- [X, f_val] = FGP_TV(single(X), lambdaFGP_TV, IterationsRegul, tol, 'iso');
- objective(i) = objective(i) + f_val;
- end
- if (lambdaSB_TV > 0)
- % Split Bregman regularization
- X = SplitBregman_TV(single(X), lambdaSB_TV, IterationsRegul, tol); % (more memory efficent)
- end
- if (lambdaHO > 0)
- % Higher Order (LLT) regularization
- X2 = LLT_model(single(X), lambdaHO, tauHO, iterHO, 3.0e-05, 0);
- X = 0.5.*(X + X2); % averaged combination of two solutions
- end
-
-
-
- if (lambdaR_L1 > 0)
- r = max(abs(r)-lambdaR_L1, 0).*sign(r); % soft-thresholding operator for ring vector
end
+else
+ % Ordered Subsets (OS) FISTA reconstruction routine (normally one order of magnitude faster than classical)
+ t = 1;
+ X_t = X;
+ proj_geomSUB = proj_geom;
- t = (1 + sqrt(1 + 4*t^2))/2; % updating t
- X_t = X + ((t_old-1)/t).*(X - X_old); % updating X
- if (lambdaR_L1 > 0)
- r_x = r + ((t_old-1)/t).*(r - r_old); % updating r
- end
+ r = zeros(Detectors,SlicesZ, 'single'); % 2D array (for 3D data) of sparse "ring" vectors
+ r_x = r; % another ring variable
+ residual2 = zeros(size(sino),'single');
- if (show == 1)
- figure(10); imshow(X(:,:,slice), [0 maxvalplot]);
- if (lambdaR_L1 > 0)
- figure(11); plot(r); title('Rings offset vector')
+ % Outer FISTA iterations loop
+ for i = 1:iterFISTA
+
+ % With OS approach it becomes trickier to correlate independent subsets, hence additional work is required
+ % one solution is to work with a full sinogram at times
+ if ((i >= 3) && (lambdaR_L1 > 0))
+ [sino_id2, sino_updt2] = astra_create_sino3d_cuda(X, proj_geom, vol_geom);
+ astra_mex_data3d('delete', sino_id2);
+ end
+
+ % subsets loop
+ counterInd = 1;
+ for ss = 1:subsets
+ X_old = X;
+ t_old = t;
+ r_old = r;
+
+ numProjSub = binsDiscr(ss); % the number of projections per subset
+ CurrSubIndeces = IndicesReorg(counterInd:(counterInd + numProjSub - 1)); % extract indeces attached to the subset
+ proj_geomSUB.ProjectionAngles = angles(CurrSubIndeces);
+
+ if (lambdaR_L1 > 0)
+
+ % the ring removal part (Group-Huber fidelity)
+ % first 2 iterations do additional work reconstructing whole dataset to ensure
+ % the stablility
+ if (i < 3)
+ [sino_id2, sino_updt2] = astra_create_sino3d_cuda(X_t, proj_geom, vol_geom);
+ astra_mex_data3d('delete', sino_id2);
+ else
+ [sino_id, sino_updt] = astra_create_sino3d_cuda(X_t, proj_geomSUB, vol_geom);
+ end
+
+ for kkk = 1:anglesNumb
+ residual2(:,kkk,:) = squeeze(weights(:,kkk,:)).*(squeeze(sino_updt2(:,kkk,:)) - (squeeze(sino(:,kkk,:)) - alpha_ring.*r_x));
+ end
+
+ residual = zeros(Detectors, numProjSub, SlicesZ,'single');
+ for kkk = 1:numProjSub
+ indC = CurrSubIndeces(kkk);
+ if (i < 3)
+ residual(:,kkk,:) = squeeze(residual2(:,indC,:));
+ else
+ residual(:,kkk,:) = squeeze(weights(:,indC,:)).*(squeeze(sino_updt(:,kkk,:)) - (squeeze(sino(:,indC,:)) - alpha_ring.*r_x));
+ end
+ end
+ vec = sum(residual2,2);
+ if (SlicesZ > 1)
+ vec = squeeze(vec(:,1,:));
+ end
+ r = r_x - (1./L_const).*vec;
+ else
+ [sino_id, sino_updt] = astra_create_sino3d_cuda(X_t, proj_geomSUB, vol_geom);
+
+ if (studentt == 1)
+ % artifacts removal with Students t penalty
+ residual = squeeze(weights(:,CurrSubIndeces,:)).*(sino_updt - squeeze(sino(:,CurrSubIndeces,:)));
+
+ for kkk = 1:SlicesZ
+ res_vec = reshape(residual(:,:,kkk), Detectors*numProjSub, 1); % 1D vectorized sinogram
+ %s = 100;
+ %gr = (2)*res_vec./(s*2 + conj(res_vec).*res_vec);
+ [ff, gr] = studentst(res_vec, 1);
+ residual(:,:,kkk) = reshape(gr, Detectors, numProjSub);
+ end
+ objective(i) = ff; % for the objective function output
+ else
+ % no ring removal (LS model)
+ residual = squeeze(weights(:,CurrSubIndeces,:)).*(sino_updt - squeeze(sino(:,CurrSubIndeces,:)));
+ end
+ end
+
+ [id, x_temp] = astra_create_backprojection3d_cuda(residual, proj_geomSUB, vol_geom);
+
+ X = X_t - (1/L_const).*x_temp;
+ astra_mex_data3d('delete', sino_id);
+ astra_mex_data3d('delete', id);
+
+ % regularization
+ if (lambdaFGP_TV > 0)
+ % FGP-TV regularization
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ [X(:,:,kkk), f_val] = FGP_TV(single(X(:,:,kkk)), lambdaFGP_TV/subsets, IterationsRegul, tol, 'iso');
+ end
+ else
+ % 3D regularization
+ [X, f_val] = FGP_TV(single(X), lambdaFGP_TV/subsets, IterationsRegul, tol, 'iso');
+ end
+ objective(i) = objective(i) + f_val;
+ end
+ if (lambdaSB_TV > 0)
+ % Split Bregman regularization
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = SplitBregman_TV(single(X(:,:,kkk)), lambdaSB_TV/subsets, IterationsRegul, tol); % (more memory efficent)
+ end
+ else
+ % 3D regularization
+ X = SplitBregman_TV(single(X), lambdaSB_TV/subsets, IterationsRegul, tol); % (more memory efficent)
+ end
+ end
+ if (lambdaHO > 0)
+ % Higher Order (LLT) regularization
+ X2 = zeros(N,N,SlicesZ,'single');
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X2(:,:,kkk) = LLT_model(single(X(:,:,kkk)), lambdaHO/subsets, tauHO/subsets, iterHO, 2.0e-05, 0);
+ end
+ else
+ % 3D regularization
+ X2 = LLT_model(single(X), lambdaHO/subsets, tauHO/subsets, iterHO, 2.0e-05, 0);
+ end
+ X = 0.5.*(X + X2); % the averaged combination of two solutions
+ end
+ if (lambdaPB > 0)
+ % Patch-Based regularization (can be slow on CPU)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = PatchBased_Regul(single(X(:,:,kkk)), SearchW, SimilW, h_PB, lambdaPB/subsets);
+ end
+ else
+ X = PatchBased_Regul(single(X), SearchW, SimilW, h_PB, lambdaPB/subsets);
+ end
+ end
+ if (lambdaPB_GPU > 0)
+ % Patch-Based regularization (GPU CUDA implementation)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = NLM_GPU(single(X(:,:,kkk)), SearchW, SimilW, h_PB, lambdaPB_GPU);
+ end
+ else
+ X = NLM_GPU(single(X), SearchW, SimilW, h_PB, lambdaPB_GPU);
+ end
+ end
+ if (LambdaDiff_HO > 0)
+ % Higher-order diffusion penalty (GPU CUDA implementation)
+ if ((strcmp('2D', Dimension) == 1))
+ % 2D regularization
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = Diff4thHajiaboli_GPU(single(X(:,:,kkk)), LambdaDiff_HO_EdgePar, LambdaDiff_HO, round(IterationsRegul/subsets));
+ end
+ else
+ X = Diff4thHajiaboli_GPU(X, LambdaDiff_HO_EdgePar, LambdaDiff_HO, round(IterationsRegul/subsets));
+ end
+ end
+ if (LambdaTGV > 0)
+ % Total Generalized variation (currently only 2D)
+ lamTGV1 = 1.1; % smoothing trade-off parameters, see Pock's paper
+ lamTGV2 = 0.5; % second-order term
+ for kkk = 1:SlicesZ
+ X(:,:,kkk) = TGV_PD(single(X(:,:,kkk)), LambdaTGV/subsets, lamTGV1, lamTGV2, IterationsRegul);
+ end
+ end
+
+ if (lambdaR_L1 > 0)
+ r = max(abs(r)-lambdaR_L1, 0).*sign(r); % soft-thresholding operator for ring vector
+ end
+
+ t = (1 + sqrt(1 + 4*t^2))/2; % updating t
+ X_t = X + ((t_old-1)/t).*(X - X_old); % updating X
+
+ if (lambdaR_L1 > 0)
+ r_x = r + ((t_old-1)/t).*(r - r_old); % updating r
+ end
+
+ counterInd = counterInd + numProjSub;
+ end
+
+ if (show == 1)
+ figure(10); imshow(X(:,:,slice), [0 maxvalplot]);
+ if (lambdaR_L1 > 0)
+ figure(11); plot(r); title('Rings offset vector')
+ end
+ pause(0.01);
+ end
+
+ if (strcmp(X_ideal, 'none' ) == 0)
+ Resid_error(i) = RMSE(X(ROI), X_ideal(ROI));
+ fprintf('%s %i %s %s %.4f %s %s %f \n', 'Iteration Number:', i, '|', 'Error RMSE:', Resid_error(i), '|', 'Objective:', objective(i));
+ else
+ fprintf('%s %i %s %s %f \n', 'Iteration Number:', i, '|', 'Objective:', objective(i));
end
- pause(0.01);
- end
- if (strcmp(X_ideal, 'none' ) == 0)
- Resid_error(i) = RMSE(X(ROI), X_ideal(ROI));
- fprintf('%s %i %s %s %.4f %s %s %f \n', 'Iteration Number:', i, '|', 'Error RMSE:', Resid_error(i), '|', 'Objective:', objective(i));
- else
- fprintf('%s %i %s %s %f \n', 'Iteration Number:', i, '|', 'Objective:', objective(i));
end
end
+
output.Resid_error = Resid_error;
output.objective = objective;
output.L_const = L_const;
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
end
diff --git a/main_func/compile_mex.m b/main_func/compile_mex.m
index 7bfa8eb..66c05da 100644
--- a/main_func/compile_mex.m
+++ b/main_func/compile_mex.m
@@ -1,10 +1,11 @@
-% compile mex's once
+% compile mex's in Matlab once
cd regularizers_CPU/
-mex LLT_model.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-mex FGP_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-mex SplitBregman_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-mex TGV_PD.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+mex LLT_model.c LLT_model_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+mex FGP_TV.c FGP_TV_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+mex SplitBregman_TV.c SplitBregman_TV_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+mex TGV_PD.c TGV_PD_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+mex PatchBased_Regul.c PatchBased_Regul_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
cd ../../
cd demos
diff --git a/main_func/regularizers_CPU/FGP_TV.c b/main_func/regularizers_CPU/FGP_TV.c
index b9dc57f..66442c9 100644
--- a/main_func/regularizers_CPU/FGP_TV.c
+++ b/main_func/regularizers_CPU/FGP_TV.c
@@ -55,7 +55,7 @@ void mexFunction(
{
int number_of_dims, iter, dimX, dimY, dimZ, ll, j, count, methTV;
const int *dim_array;
- float *A, *D=NULL, *D_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_old=NULL, *P2_old=NULL, *P3_old=NULL, *R1=NULL, *R2=NULL, *R3=NULL, lambda, tk, tkp1, re, re1, re_old, epsil, funcval;
+ float *A, *D=NULL, *D_old=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_old=NULL, *P2_old=NULL, *P3_old=NULL, *R1=NULL, *R2=NULL, *R3=NULL, lambda, tk, tkp1, re, re1, re_old, epsil;
number_of_dims = mxGetNumberOfDimensions(prhs[0]);
dim_array = mxGetDimensions(prhs[0]);
@@ -79,7 +79,6 @@ void mexFunction(
mxFree(penalty_type);
}
/*output function value (last iteration) */
- funcval = 0.0f;
plhs[1] = mxCreateNumericMatrix(1, 1, mxSINGLE_CLASS, mxREAL);
float *funcvalA = (float *) mxGetData(plhs[1]);
@@ -118,7 +117,7 @@ void mexFunction(
/*updating R and t*/
tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f;
- Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);
+ Rupd_func2D(P1, P1_old, P2, P2_old, R1, R2, tkp1, tk, dimX, dimY);
/* calculate norm */
re = 0.0f; re1 = 0.0f;
@@ -130,23 +129,17 @@ void mexFunction(
re = sqrt(re)/sqrt(re1);
if (re < epsil) count++;
if (count > 3) {
- Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
+ Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY);
break; }
/* check that the residual norm is decreasing */
if (ll > 2) {
if (re > re_old) {
- Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
+ Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY);
break; }}
re_old = re;
- /*printf("%f %i %i \n", re, ll, count); */
-
+ /*printf("%f %i %i \n", re, ll, count); */
+
/*storing old values*/
copyIm(D, D_old, dimX, dimY, dimZ);
copyIm(P1, P1_old, dimX, dimY, dimZ);
@@ -154,12 +147,7 @@ void mexFunction(
tk = tkp1;
/* calculating the objective function value */
- if (ll == (iter-1)) {
- Obj_func2D(A, D, P1, P2, lambda, dimX, dimY);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
- }
+ if (ll == (iter-1)) Obj_func_CALC2D(A, D, funcvalA, lambda, dimX, dimY);
}
printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
}
@@ -203,21 +191,14 @@ void mexFunction(
/* stop if the norm residual is less than the tolerance EPS */
if (re < epsil) count++;
if (count > 3) {
- Obj_func3D(A, D, P1, P2, P3,lambda, dimX, dimY, dimZ);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
+ Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);
break;}
/* check that the residual norm is decreasing */
if (ll > 2) {
if (re > re_old) {
- Obj_func3D(A, D, P1, P2, P3,lambda, dimX, dimY, dimZ);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
- break; }}
-
+ Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);
+ }}
re_old = re;
/*printf("%f %i %i \n", re, ll, count); */
@@ -228,13 +209,7 @@ void mexFunction(
copyIm(P3, P3_old, dimX, dimY, dimZ);
tk = tkp1;
- if (ll == (iter-1)) {
- Obj_func3D(A, D, P1, P2, P3,lambda, dimX, dimY, dimZ);
- funcval = 0.0f;
- for(j=0; j<dimX*dimY*dimZ; j++) funcval += pow(D[j],2);
- funcvalA[0] = sqrt(funcval);
- }
-
+ if (ll == (iter-1)) Obj_func_CALC3D(A, D, funcvalA, lambda, dimX, dimY, dimZ);
}
printf("FGP-TV iterations stopped at iteration %i with the function value %f \n", ll, funcvalA[0]);
}
diff --git a/main_func/regularizers_CPU/FGP_TV_core.c b/main_func/regularizers_CPU/FGP_TV_core.c
index ed5d337..03cd445 100644
--- a/main_func/regularizers_CPU/FGP_TV_core.c
+++ b/main_func/regularizers_CPU/FGP_TV_core.c
@@ -19,8 +19,58 @@ limitations under the License.
#include "FGP_TV_core.h"
+/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case)
+ *
+ * Input Parameters:
+ * 1. Noisy image/volume [REQUIRED]
+ * 2. lambda - regularization parameter [REQUIRED]
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. eplsilon: tolerance constant [OPTIONAL parameter]
+ * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter]
+ *
+ * Output:
+ * [1] Filtered/regularized image
+ * [2] last function value
+ *
+ * Example of image denoising:
+ * figure;
+ * Im = double(imread('lena_gray_256.tif'))/255; % loading image
+ * u0 = Im + .05*randn(size(Im)); % adding noise
+ * u = FGP_TV(single(u0), 0.05, 100, 1e-04);
+ *
+ * This function is based on the Matlab's code and paper by
+ * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
+ *
+ * D. Kazantsev, 2016-17
+ *
+ */
+
/* 2D-case related Functions */
/*****************************************************************/
+float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY)
+{
+ int i,j;
+ float f1, f2, val1, val2;
+
+ /*data-related term */
+ f1 = 0.0f;
+ for(i=0; i<dimX*dimY; i++) f1 += pow(D[i] - A[i],2);
+
+ /*TV-related term */
+ f2 = 0.0f;
+ for(i=0; i<dimX; i++) {
+ for(j=0; j<dimY; j++) {
+ /* boundary conditions */
+ if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(i+1)*dimY + (j)] - A[(i)*dimY + (j)];}
+ if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(i)*dimY + (j+1)] - A[(i)*dimY + (j)];}
+ f2 += sqrt(pow(val1,2) + pow(val2,2));
+ }}
+
+ /* sum of two terms */
+ funcvalA[0] = 0.5f*f1 + lambda*f2;
+ return *funcvalA;
+}
+
float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY)
{
float val1, val2;
@@ -105,6 +155,31 @@ float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1,
/* 3D-case related Functions */
/*****************************************************************/
+float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ)
+{
+ int i,j,k;
+ float f1, f2, val1, val2, val3;
+
+ /*data-related term */
+ f1 = 0.0f;
+ for(i=0; i<dimX*dimY*dimZ; i++) f1 += pow(D[i] - A[i],2);
+
+ /*TV-related term */
+ f2 = 0.0f;
+ for(i=0; i<dimX; i++) {
+ for(j=0; j<dimY; j++) {
+ for(k=0; k<dimZ; k++) {
+ /* boundary conditions */
+ if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(dimX*dimY)*k + (i+1)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}
+ if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(dimX*dimY)*k + (i)*dimY + (j+1)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}
+ if (k == dimZ-1) {val3 = 0.0f;} else {val3 = A[(dimX*dimY)*(k+1) + (i)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];}
+ f2 += sqrt(pow(val1,2) + pow(val2,2) + pow(val3,2));
+ }}}
+ /* sum of two terms */
+ funcvalA[0] = 0.5f*f1 + lambda*f2;
+ return *funcvalA;
+}
+
float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ)
{
float val1, val2, val3;
diff --git a/main_func/regularizers_CPU/FGP_TV_core.h b/main_func/regularizers_CPU/FGP_TV_core.h
index 778d489..6430bf2 100644
--- a/main_func/regularizers_CPU/FGP_TV_core.h
+++ b/main_func/regularizers_CPU/FGP_TV_core.h
@@ -59,11 +59,13 @@ float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dim
float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY);
float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY);
float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY);
+float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY);
float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ);
float Proj_func3D(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ);
float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ);
+float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ);
#ifdef __cplusplus
}
#endif \ No newline at end of file
diff --git a/main_func/regularizers_CPU/LLT_model.c b/main_func/regularizers_CPU/LLT_model.c
index 19e0109..0b07b47 100644
--- a/main_func/regularizers_CPU/LLT_model.c
+++ b/main_func/regularizers_CPU/LLT_model.c
@@ -20,6 +20,7 @@ limitations under the License.
#include "mex.h"
#include "matrix.h"
#include "LLT_model_core.h"
+
/* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model of higher order regularization penalty
*
* Input Parameters:
diff --git a/main_func/regularizers_CPU/LLT_model_core.c b/main_func/regularizers_CPU/LLT_model_core.c
index e027231..3a853d2 100644
--- a/main_func/regularizers_CPU/LLT_model_core.c
+++ b/main_func/regularizers_CPU/LLT_model_core.c
@@ -19,6 +19,31 @@ limitations under the License.
#include "LLT_model_core.h"
+/* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model of higher order regularization penalty
+*
+* Input Parameters:
+* 1. U0 - origanal noise image/volume
+* 2. lambda - regularization parameter
+* 3. tau - time-step for explicit scheme
+* 4. iter - iterations number
+* 5. epsil - tolerance constant (to terminate earlier)
+* 6. switcher - default is 0, switch to (1) to restrictive smoothing in Z dimension (in test)
+*
+* Output:
+* Filtered/regularized image
+*
+* Example:
+* figure;
+* Im = double(imread('lena_gray_256.tif'))/255; % loading image
+* u0 = Im + .03*randn(size(Im)); % adding noise
+* [Den] = LLT_model(single(u0), 10, 0.1, 1);
+*
+* References: Lysaker, Lundervold and Tai (LLT) 2003, IEEE
+*
+* 28.11.16/Harwell
+*/
+
+
float der2D(float *U, float *D1, float *D2, int dimX, int dimY, int dimZ)
{
int i, j, i_p, i_m, j_m, j_p;
diff --git a/main_func/regularizers_CPU/LLT_model_core.h b/main_func/regularizers_CPU/LLT_model_core.h
index 273c89b..13fce5a 100644
--- a/main_func/regularizers_CPU/LLT_model_core.h
+++ b/main_func/regularizers_CPU/LLT_model_core.h
@@ -27,31 +27,6 @@ limitations under the License.
#define EPS 0.01
-/* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model of higher order regularization penalty
-*
-* Input Parameters:
-* 1. U0 - origanal noise image/volume
-* 2. lambda - regularization parameter
-* 3. tau - time-step for explicit scheme
-* 4. iter - iterations number
-* 5. epsil - tolerance constant (to terminate earlier)
-* 6. switcher - default is 0, switch to (1) to restrictive smoothing in Z dimension (in test)
-*
-* Output:
-* Filtered/regularized image
-*
-* Example:
-* figure;
-* Im = double(imread('lena_gray_256.tif'))/255; % loading image
-* u0 = Im + .03*randn(size(Im)); % adding noise
-* [Den] = LLT_model(single(u0), 10, 0.1, 1);
-*
-*
-* to compile with OMP support: mex LLT_model.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
-* References: Lysaker, Lundervold and Tai (LLT) 2003, IEEE
-*
-* 28.11.16/Harwell
-*/
/* 2D functions */
#ifdef __cplusplus
extern "C" {
diff --git a/main_func/regularizers_CPU/PatchBased_Regul.c b/main_func/regularizers_CPU/PatchBased_Regul.c
index e5748dc..9c925df 100644
--- a/main_func/regularizers_CPU/PatchBased_Regul.c
+++ b/main_func/regularizers_CPU/PatchBased_Regul.c
@@ -28,27 +28,23 @@ limitations under the License.
* References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
* 2. Kazantsev D. et al. "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
*
- * Input Parameters (mandatory):
- * 1. Image (2D or 3D)
- * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window)
- * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window)
- * 4. h - parameter for the PB penalty function
- * 5. lambda - regularization parameter
+ * Input Parameters:
+ * 1. Image (2D or 3D) [required]
+ * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window) [optional]
+ * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window) [optional]
+ * 4. h - parameter for the PB penalty function [optional]
+ * 5. lambda - regularization parameter [optional]
* Output:
* 1. regularized (denoised) Image (N x N)/volume (N x N x N)
*
- * Quick 2D denoising example in Matlab:
+ * 2D denoising example in Matlab:
Im = double(imread('lena_gray_256.tif'))/255; % loading image
u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
- ImDen = PB_Regul_CPU(single(u0), 3, 1, 0.08, 0.05);
- *
- * Please see more tests in a file:
- TestTemporalSmoothing.m
-
+ ImDen = PatchBased_Regul(single(u0), 3, 1, 0.08, 0.05);
*
* Matlab + C/mex compilers needed
- * to compile with OMP support: mex PB_Regul_CPU.c CFLAGS="\$CFLAGS -fopenmp -Wall" LDFLAGS="\$LDFLAGS -fopenmp"
+ * to compile with OMP support: mex PatchBased_Regul.c CFLAGS="\$CFLAGS -fopenmp -Wall" LDFLAGS="\$LDFLAGS -fopenmp"
*
* D. Kazantsev *
* 02/07/2014
@@ -71,17 +67,23 @@ void mexFunction(
M = dims[1];
Z = dims[2];
- if ((numdims < 2) || (numdims > 3)) {mexErrMsgTxt("The input should be 2D image or 3D volume");}
+ if ((numdims < 2) || (numdims > 3)) {mexErrMsgTxt("The input is 2D image or 3D volume");}
if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
if(nrhs != 5) mexErrMsgTxt("Five inputs reqired: Image(2D,3D), SearchW, SimilW, Threshold, Regularization parameter");
/*Handling inputs*/
- A = (float *) mxGetData(prhs[0]); /* the image to regularize/filter */
- SearchW_real = (int) mxGetScalar(prhs[1]); /* the searching window ratio */
- SimilW = (int) mxGetScalar(prhs[2]); /* the similarity window ratio */
- h = (float) mxGetScalar(prhs[3]); /* parameter for the PB filtering function */
- lambda = (float) mxGetScalar(prhs[4]); /* regularization parameter */
+ A = (float *) mxGetData(prhs[0]); /* the image/volume to regularize/filter */
+ SearchW_real = 3; /*default value*/
+ SimilW = 1; /*default value*/
+ h = 0.1;
+ lambda = 0.1;
+
+ if ((nrhs == 2) || (nrhs == 3) || (nrhs == 4) || (nrhs == 5)) SearchW_real = (int) mxGetScalar(prhs[1]); /* the searching window ratio */
+ if ((nrhs == 3) || (nrhs == 4) || (nrhs == 5)) SimilW = (int) mxGetScalar(prhs[2]); /* the similarity window ratio */
+ if ((nrhs == 4) || (nrhs == 5)) h = (float) mxGetScalar(prhs[3]); /* parameter for the PB filtering function */
+ if ((nrhs == 5)) lambda = (float) mxGetScalar(prhs[4]); /* regularization parameter */
+
if (h <= 0) mexErrMsgTxt("Parmeter for the PB penalty function should be > 0");
if (lambda <= 0) mexErrMsgTxt(" Regularization parmeter should be > 0");
@@ -90,7 +92,6 @@ void mexFunction(
/* SearchW_full = 2*SearchW + 1; */ /* the full searching window size */
/* SimilW_full = 2*SimilW + 1; */ /* the full similarity window size */
-
padXY = SearchW + 2*SimilW; /* padding sizes */
newsizeX = N + 2*(padXY); /* the X size of the padded array */
@@ -136,4 +137,4 @@ void mexFunction(
switchpad_crop = 1; /*cropping*/
pad_crop(Bp, B, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
} /*end else ndims*/
-} \ No newline at end of file
+}
diff --git a/main_func/regularizers_CPU/PatchBased_Regul_core.c b/main_func/regularizers_CPU/PatchBased_Regul_core.c
index 6f0a48d..acfb464 100644
--- a/main_func/regularizers_CPU/PatchBased_Regul_core.c
+++ b/main_func/regularizers_CPU/PatchBased_Regul_core.c
@@ -19,40 +19,33 @@ limitations under the License.
#include "PatchBased_Regul_core.h"
-/* C-OMP implementation of patch-based (PB) regularization (2D and 3D cases).
-* This method finds self-similar patches in data and performs one fixed point iteration to mimimize the PB penalty function
-*
-* References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
-* 2. Kazantsev D. et al. "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
-*
-* Input Parameters (mandatory):
-* 1. Image (2D or 3D)
-* 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window)
-* 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window)
-* 4. h - parameter for the PB penalty function
-* 5. lambda - regularization parameter
+/* C-OMP implementation of patch-based (PB) regularization (2D and 3D cases).
+ * This method finds self-similar patches in data and performs one fixed point iteration to mimimize the PB penalty function
+ *
+ * References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
+ * 2. Kazantsev D. et al. "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
+ *
+ * Input Parameters:
+ * 1. Image (2D or 3D) [required]
+ * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window) [optional]
+ * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window) [optional]
+ * 4. h - parameter for the PB penalty function [optional]
+ * 5. lambda - regularization parameter [optional]
-* Output:
-* 1. regularized (denoised) Image (N x N)/volume (N x N x N)
-*
-* Quick 2D denoising example in Matlab:
-Im = double(imread('lena_gray_256.tif'))/255; % loading image
-u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
-ImDen = PB_Regul_CPU(single(u0), 3, 1, 0.08, 0.05);
-*
-* Please see more tests in a file:
-TestTemporalSmoothing.m
-
-*
-* Matlab + C/mex compilers needed
-* to compile with OMP support: mex PB_Regul_CPU.c CFLAGS="\$CFLAGS -fopenmp -Wall" LDFLAGS="\$LDFLAGS -fopenmp"
-*
-* D. Kazantsev *
-* 02/07/2014
-* Harwell, UK
-*/
+ * Output:
+ * 1. regularized (denoised) Image (N x N)/volume (N x N x N)
+ *
+ * 2D denoising example in Matlab:
+ Im = double(imread('lena_gray_256.tif'))/255; % loading image
+ u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
+ ImDen = PatchBased_Regul(single(u0), 3, 1, 0.08, 0.05);
+
+ * D. Kazantsev *
+ * 02/07/2014
+ * Harwell, UK
+ */
-/*2D version*/
+/*2D version function */
float PB_FUNC2D(float *A, float *B, int dimX, int dimY, int padXY, int SearchW, int SimilW, float h, float lambda)
{
int i, j, i_n, j_n, i_m, j_m, i_p, j_p, i_l, j_l, i1, j1, i2, j2, i3, j3, i5,j5, count, SimilW_full;
diff --git a/main_func/regularizers_CPU/SplitBregman_TV_core.c b/main_func/regularizers_CPU/SplitBregman_TV_core.c
index ce9ef93..4109a4b 100644
--- a/main_func/regularizers_CPU/SplitBregman_TV_core.c
+++ b/main_func/regularizers_CPU/SplitBregman_TV_core.c
@@ -37,7 +37,6 @@ limitations under the License.
* u0 = Im + .05*randn(size(Im)); u0(u0 < 0) = 0;
* u = SplitBregman_TV(single(u0), 10, 30, 1e-04);
*
-* to compile with OMP support: mex SplitBregman_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
* References:
* The Split Bregman Method for L1 Regularized Problems, by Tom Goldstein and Stanley Osher.
* D. Kazantsev, 2016*
diff --git a/main_func/regularizers_CPU/TGV_PD.c b/main_func/regularizers_CPU/TGV_PD.c
index 6a7697b..c9cb440 100644
--- a/main_func/regularizers_CPU/TGV_PD.c
+++ b/main_func/regularizers_CPU/TGV_PD.c
@@ -37,9 +37,9 @@ limitations under the License.
* figure;
* Im = double(imread('lena_gray_256.tif'))/255; % loading image
* u0 = Im + .03*randn(size(Im)); % adding noise
- * tic; u = PrimalDual_TGV(single(u0), 0.02, 1.3, 1, 550); toc;
+ * tic; u = TGV_PD(single(u0), 0.02, 1.3, 1, 550); toc;
*
- * to compile with OMP support: mex TGV_PD.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
+ * to compile with OMP support: mex TGV_PD.c TGV_PD_core.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
* References:
* K. Bredies "Total Generalized Variation"
*
@@ -53,7 +53,7 @@ void mexFunction(
{
int number_of_dims, iter, dimX, dimY, dimZ, ll;
const int *dim_array;
- float *A, *U, *U_old, *P1, *P2, *P3, *Q1, *Q2, *Q3, *Q4, *Q5, *Q6, *Q7, *Q8, *Q9, *V1, *V1_old, *V2, *V2_old, *V3, *V3_old, lambda, L2, tau, sigma, alpha1, alpha0;
+ float *A, *U, *U_old, *P1, *P2, *Q1, *Q2, *Q3, *V1, *V1_old, *V2, *V2_old, lambda, L2, tau, sigma, alpha1, alpha0;
number_of_dims = mxGetNumberOfDimensions(prhs[0]);
dim_array = mxGetDimensions(prhs[0]);
@@ -88,48 +88,17 @@ void mexFunction(
V1 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
V1_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
V2 = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
- V2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL)); }
- else if (number_of_dims == 3) {
- mexErrMsgTxt("The input data should be 2D");
- /*3D case*/
-// dimZ = dim_array[2];
-// U = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-//
-// P1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// P2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// P3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-//
-// Q1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q4 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q5 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q6 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q7 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q8 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// Q9 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-//
-// U_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-//
-// V1 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// V1_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// V2 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// V2_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// V3 = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
-// V3_old = (float*)mxGetPr(mxCreateNumericArray(3, dim_array, mxSINGLE_CLASS, mxREAL));
- }
- else {mexErrMsgTxt("The input data should be 2D");}
-
-
- /*printf("%i \n", i);*/
- L2 = 12.0; /*Lipshitz constant*/
+ V2_old = (float*)mxGetPr(mxCreateNumericArray(2, dim_array, mxSINGLE_CLASS, mxREAL));
+
+
+ /*printf("%i \n", i);*/
+ L2 = 12.0f; /*Lipshitz constant*/
tau = 1.0/pow(L2,0.5);
sigma = 1.0/pow(L2,0.5);
/*Copy A to U*/
copyIm(A, U, dimX, dimY, dimZ);
- if (number_of_dims == 2) {
/* Here primal-dual iterations begin for 2D */
for(ll = 0; ll < iter; ll++) {
@@ -164,23 +133,12 @@ void mexFunction(
/*get new V*/
newU(V1, V1_old, dimX, dimY, dimZ);
newU(V2, V2_old, dimX, dimY, dimZ);
- } /*end of iterations*/
+ } /*end of iterations*/
}
-
-// /*3D version*/
-// if (number_of_dims == 3) {
-// /* Here primal-dual iterations begin for 3D */
-// for(ll = 0; ll < iter; ll++) {
-//
-// /* Calculate Dual Variable P */
-// DualP_3D(U, V1, V2, V3, P1, P2, P3, dimX, dimY, dimZ, sigma);
-//
-// /*Projection onto convex set for P*/
-// ProjP_3D(P1, P2, P3, dimX, dimY, dimZ, alpha1);
-//
-// /* Calculate Dual Variable Q */
-// DualQ_3D(V1, V2, V2, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, dimX, dimY, dimZ, sigma);
-//
-// } /*end of iterations*/
-// }
+ else if (number_of_dims == 3) {
+ mexErrMsgTxt("The input data should be a 2D array");
+ /*3D case*/
+ }
+ else {mexErrMsgTxt("The input data should be a 2D array");}
+
}
diff --git a/main_func/regularizers_CPU/TGV_PD_core.c b/main_func/regularizers_CPU/TGV_PD_core.c
index ec7cadb..4139d10 100644
--- a/main_func/regularizers_CPU/TGV_PD_core.c
+++ b/main_func/regularizers_CPU/TGV_PD_core.c
@@ -38,7 +38,6 @@ limitations under the License.
* u0 = Im + .03*randn(size(Im)); % adding noise
* tic; u = PrimalDual_TGV(single(u0), 0.02, 1.3, 1, 550); toc;
*
- * to compile with OMP support: mex TGV_PD.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp"
* References:
* K. Bredies "Total Generalized Variation"
*
diff --git a/main_func/regularizers_CPU/TGV_PD_core.h b/main_func/regularizers_CPU/TGV_PD_core.h
index 25a8216..d5378df 100644
--- a/main_func/regularizers_CPU/TGV_PD_core.h
+++ b/main_func/regularizers_CPU/TGV_PD_core.h
@@ -60,9 +60,6 @@ float DualQ_2D(float *V1, float *V2, float *Q1, float *Q2, float *Q3, int dimX,
float ProjQ_2D(float *Q1, float *Q2, float *Q3, int dimX, int dimY, int dimZ, float alpha0);
float DivProjP_2D(float *U, float *A, float *P1, float *P2, int dimX, int dimY, int dimZ, float lambda, float tau);
float UpdV_2D(float *V1, float *V2, float *P1, float *P2, float *Q1, float *Q2, float *Q3, int dimX, int dimY, int dimZ, float tau);
-/*3D functions*/
-float DualP_3D(float *U, float *V1, float *V2, float *V3, float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ, float sigma);
-
float newU(float *U, float *U_old, int dimX, int dimY, int dimZ);
//float copyIm(float *A, float *U, int dimX, int dimY, int dimZ);
#ifdef __cplusplus
diff --git a/main_func/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp b/main_func/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp
new file mode 100644
index 0000000..5a8c7c0
--- /dev/null
+++ b/main_func/regularizers_GPU/Diffus_HO/Diff4thHajiaboli_GPU.cpp
@@ -0,0 +1,114 @@
+#include "mex.h"
+#include <matrix.h>
+#include <math.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdio.h>
+#include <iostream>
+#include "Diff4th_GPU_kernel.h"
+
+/*
+ * 2D and 3D CUDA implementation of the 4th order PDE denoising model by Hajiaboli
+ *
+ * Reference :
+ * "An anisotropic fourth-order diffusion filter for image noise removal" by M. Hajiaboli
+ *
+ * Example
+ * figure;
+ * Im = double(imread('lena_gray_256.tif'))/255; % loading image
+ * u0 = Im + .05*randn(size(Im)); % adding noise
+ * u = Diff4thHajiaboli_GPU(single(u0), 0.02, 150);
+ * subplot (1,2,1); imshow(u0,[ ]); title('Noisy Image')
+ * subplot (1,2,2); imshow(u,[ ]); title('Denoised Image')
+ *
+ *
+ * Linux/Matlab compilation:
+ * compile in terminal: nvcc -Xcompiler -fPIC -shared -o Diff4th_GPU_kernel.o Diff4th_GPU_kernel.cu
+ * then compile in Matlab: mex -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart Diff4thHajiaboli_GPU.cpp Diff4th_GPU_kernel.o
+ */
+
+void mexFunction(
+ int nlhs, mxArray *plhs[],
+ int nrhs, const mxArray *prhs[])
+{
+ int numdims, dimZ, size;
+ float *A, *B, *A_L, *B_L;
+ const int *dims;
+
+ numdims = mxGetNumberOfDimensions(prhs[0]);
+ dims = mxGetDimensions(prhs[0]);
+
+ float sigma = (float)mxGetScalar(prhs[1]); /* edge-preserving parameter */
+ float lambda = (float)mxGetScalar(prhs[2]); /* regularization parameter */
+ int iter = (int)mxGetScalar(prhs[3]); /* iterations number */
+
+ if (numdims == 2) {
+
+ int N, M, Z, i, j;
+ Z = 0; // for the 2D case
+ float tau = 0.01; // time step is sufficiently small for an explicit methods
+
+ /*Input data*/
+ A = (float*)mxGetData(prhs[0]);
+ N = dims[0] + 2;
+ M = dims[1] + 2;
+ A_L = (float*)mxGetData(mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
+ B_L = (float*)mxGetData(mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
+
+ /*Output data*/
+ B = (float*)mxGetData(plhs[0] = mxCreateNumericMatrix(dims[0], dims[1], mxSINGLE_CLASS, mxREAL));
+
+ // copy A to the bigger A_L with boundaries
+ #pragma omp parallel for shared(A_L, A) private(i,j)
+ for (i=0; i < N; i++) {
+ for (j=0; j < M; j++) {
+ if (((i > 0) && (i < N-1)) && ((j > 0) && (j < M-1))) A_L[i*M+j] = A[(i-1)*(dims[1])+(j-1)];
+ }}
+
+ // Running CUDA code here
+ Diff4th_GPU_kernel(A_L, B_L, N, M, Z, (float)sigma, iter, (float)tau, lambda);
+
+ // copy the processed B_L to a smaller B
+ #pragma omp parallel for shared(B_L, B) private(i,j)
+ for (i=0; i < N; i++) {
+ for (j=0; j < M; j++) {
+ if (((i > 0) && (i < N-1)) && ((j > 0) && (j < M-1))) B[(i-1)*(dims[1])+(j-1)] = B_L[i*M+j];
+ }}
+ }
+ if (numdims == 3) {
+ // 3D image denoising / regularization
+ int N, M, Z, i, j, k;
+ float tau = 0.0007; // Time Step is small for an explicit methods
+ A = (float*)mxGetData(prhs[0]);
+ N = dims[0] + 2;
+ M = dims[1] + 2;
+ Z = dims[2] + 2;
+ int N_dims[] = {N, M, Z};
+ A_L = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
+ B_L = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
+
+ /* output data */
+ B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL));
+
+ // copy A to the bigger A_L with boundaries
+ #pragma omp parallel for shared(A_L, A) private(i,j,k)
+ for (i=0; i < N; i++) {
+ for (j=0; j < M; j++) {
+ for (k=0; k < Z; k++) {
+ if (((i > 0) && (i < N-1)) && ((j > 0) && (j < M-1)) && ((k > 0) && (k < Z-1))) {
+ A_L[(N*M)*(k)+(i)*M+(j)] = A[(dims[0]*dims[1])*(k-1)+(i-1)*dims[1]+(j-1)];
+ }}}}
+
+ // Running CUDA kernel here for diffusivity
+ Diff4th_GPU_kernel(A_L, B_L, N, M, Z, (float)sigma, iter, (float)tau, lambda);
+
+ // copy the processed B_L to a smaller B
+ #pragma omp parallel for shared(B_L, B) private(i,j,k)
+ for (i=0; i < N; i++) {
+ for (j=0; j < M; j++) {
+ for (k=0; k < Z; k++) {
+ if (((i > 0) && (i < N-1)) && ((j > 0) && (j < M-1)) && ((k > 0) && (k < Z-1))) {
+ B[(dims[0]*dims[1])*(k-1)+(i-1)*dims[1]+(j-1)] = B_L[(N*M)*(k)+(i)*M+(j)];
+ }}}}
+ }
+} \ No newline at end of file
diff --git a/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu b/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu
new file mode 100644
index 0000000..178af00
--- /dev/null
+++ b/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.cu
@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "Diff4th_GPU_kernel.h"
+
+#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+ if (cudaSuccess != err)
+ {
+ fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+ file, line, (int)err, cudaGetErrorString(err));
+ exit(EXIT_FAILURE);
+ }
+}
+
+#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
+#define sizeT (sizeX*sizeY*sizeZ)
+#define epsilon 0.00000001
+
+/////////////////////////////////////////////////
+// 2D Image denosing - Second Step (The second derrivative)
+__global__ void Diff4th2D_derriv(float* B, float* A, float *A0, int N, int M, float sigma, int iter, float tau, float lambda)
+{
+ float gradXXc = 0, gradYYc = 0;
+ int i = blockIdx.x*blockDim.x + threadIdx.x;
+ int j = blockIdx.y*blockDim.y + threadIdx.y;
+
+ int index = j + i*N;
+
+ if (((i < 1) || (i > N-2)) || ((j < 1) || (j > M-2))) {
+ return; }
+
+ int indexN = (j)+(i-1)*(N); if (A[indexN] == 0) indexN = index;
+ int indexS = (j)+(i+1)*(N); if (A[indexS] == 0) indexS = index;
+ int indexW = (j-1)+(i)*(N); if (A[indexW] == 0) indexW = index;
+ int indexE = (j+1)+(i)*(N); if (A[indexE] == 0) indexE = index;
+
+ gradXXc = B[indexN] + B[indexS] - 2*B[index] ;
+ gradYYc = B[indexW] + B[indexE] - 2*B[index] ;
+ A[index] = A[index] - tau*((A[index] - A0[index]) + lambda*(gradXXc + gradYYc));
+}
+
+// 2D Image denosing - The First Step
+__global__ void Diff4th2D(float* A, float* B, int N, int M, float sigma, int iter, float tau)
+{
+ float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, sq_sum, xy_2, V_norm, V_orth, c, c_sq;
+
+ int i = blockIdx.x*blockDim.x + threadIdx.x;
+ int j = blockIdx.y*blockDim.y + threadIdx.y;
+
+ int index = j + i*N;
+
+ V_norm = 0.0f; V_orth = 0.0f;
+
+ if (((i < 1) || (i > N-2)) || ((j < 1) || (j > M-2))) {
+ return; }
+
+ int indexN = (j)+(i-1)*(N); if (A[indexN] == 0) indexN = index;
+ int indexS = (j)+(i+1)*(N); if (A[indexS] == 0) indexS = index;
+ int indexW = (j-1)+(i)*(N); if (A[indexW] == 0) indexW = index;
+ int indexE = (j+1)+(i)*(N); if (A[indexE] == 0) indexE = index;
+ int indexNW = (j-1)+(i-1)*(N); if (A[indexNW] == 0) indexNW = index;
+ int indexNE = (j+1)+(i-1)*(N); if (A[indexNE] == 0) indexNE = index;
+ int indexWS = (j-1)+(i+1)*(N); if (A[indexWS] == 0) indexWS = index;
+ int indexES = (j+1)+(i+1)*(N); if (A[indexES] == 0) indexES = index;
+
+ gradX = 0.5f*(A[indexN]-A[indexS]);
+ gradX_sq = gradX*gradX;
+ gradXX = A[indexN] + A[indexS] - 2*A[index];
+
+ gradY = 0.5f*(A[indexW]-A[indexE]);
+ gradY_sq = gradY*gradY;
+ gradYY = A[indexW] + A[indexE] - 2*A[index];
+
+ gradXY = 0.25f*(A[indexNW] - A[indexNE] - A[indexWS] + A[indexES]);
+ xy_2 = 2.0f*gradX*gradY*gradXY;
+ sq_sum = gradX_sq + gradY_sq;
+
+ if (sq_sum <= epsilon) {
+ V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/epsilon;
+ V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/epsilon; }
+ else {
+ V_norm = (gradXX*gradX_sq + xy_2 + gradYY*gradY_sq)/sq_sum;
+ V_orth = (gradXX*gradY_sq - xy_2 + gradYY*gradX_sq)/sq_sum; }
+
+ c = 1.0f/(1.0f + sq_sum/sigma);
+ c_sq = c*c;
+ B[index] = c_sq*V_norm + c*V_orth;
+}
+
+/////////////////////////////////////////////////
+// 3D data parocerssing
+__global__ void Diff4th3D_derriv(float *B, float *A, float *A0, int N, int M, int Z, float sigma, int iter, float tau, float lambda)
+{
+ float gradXXc = 0, gradYYc = 0, gradZZc = 0;
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+ int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = xIndex + M*yIndex + N*M*zIndex;
+
+ if (((xIndex < 1) || (xIndex > N-2)) || ((yIndex < 1) || (yIndex > M-2)) || ((zIndex < 1) || (zIndex > Z-2))) {
+ return; }
+
+ int indexN = (xIndex-1) + M*yIndex + N*M*zIndex; if (A[indexN] == 0) indexN = index;
+ int indexS = (xIndex+1) + M*yIndex + N*M*zIndex; if (A[indexS] == 0) indexS = index;
+ int indexW = xIndex + M*(yIndex-1) + N*M*zIndex; if (A[indexW] == 0) indexW = index;
+ int indexE = xIndex + M*(yIndex+1) + N*M*zIndex; if (A[indexE] == 0) indexE = index;
+ int indexU = xIndex + M*yIndex + N*M*(zIndex-1); if (A[indexU] == 0) indexU = index;
+ int indexD = xIndex + M*yIndex + N*M*(zIndex+1); if (A[indexD] == 0) indexD = index;
+
+ gradXXc = B[indexN] + B[indexS] - 2*B[index] ;
+ gradYYc = B[indexW] + B[indexE] - 2*B[index] ;
+ gradZZc = B[indexU] + B[indexD] - 2*B[index] ;
+
+ A[index] = A[index] - tau*((A[index] - A0[index]) + lambda*(gradXXc + gradYYc + gradZZc));
+}
+
+__global__ void Diff4th3D(float* A, float* B, int N, int M, int Z, float sigma, int iter, float tau)
+{
+ float gradX, gradX_sq, gradY, gradY_sq, gradZ, gradZ_sq, gradXX, gradYY, gradZZ, gradXY, gradXZ, gradYZ, sq_sum, xy_2, xyz_1, xyz_2, V_norm, V_orth, c, c_sq;
+
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+ int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = xIndex + M*yIndex + N*M*zIndex;
+ V_norm = 0.0f; V_orth = 0.0f;
+
+ if (((xIndex < 1) || (xIndex > N-2)) || ((yIndex < 1) || (yIndex > M-2)) || ((zIndex < 1) || (zIndex > Z-2))) {
+ return; }
+
+ B[index] = 0;
+
+ int indexN = (xIndex-1) + M*yIndex + N*M*zIndex; if (A[indexN] == 0) indexN = index;
+ int indexS = (xIndex+1) + M*yIndex + N*M*zIndex; if (A[indexS] == 0) indexS = index;
+ int indexW = xIndex + M*(yIndex-1) + N*M*zIndex; if (A[indexW] == 0) indexW = index;
+ int indexE = xIndex + M*(yIndex+1) + N*M*zIndex; if (A[indexE] == 0) indexE = index;
+ int indexU = xIndex + M*yIndex + N*M*(zIndex-1); if (A[indexU] == 0) indexU = index;
+ int indexD = xIndex + M*yIndex + N*M*(zIndex+1); if (A[indexD] == 0) indexD = index;
+
+ int indexNW = (xIndex-1) + M*(yIndex-1) + N*M*zIndex; if (A[indexNW] == 0) indexNW = index;
+ int indexNE = (xIndex-1) + M*(yIndex+1) + N*M*zIndex; if (A[indexNE] == 0) indexNE = index;
+ int indexWS = (xIndex+1) + M*(yIndex-1) + N*M*zIndex; if (A[indexWS] == 0) indexWS = index;
+ int indexES = (xIndex+1) + M*(yIndex+1) + N*M*zIndex; if (A[indexES] == 0) indexES = index;
+
+ int indexUW = (xIndex-1) + M*(yIndex) + N*M*(zIndex-1); if (A[indexUW] == 0) indexUW = index;
+ int indexUE = (xIndex+1) + M*(yIndex) + N*M*(zIndex-1); if (A[indexUE] == 0) indexUE = index;
+ int indexDW = (xIndex-1) + M*(yIndex) + N*M*(zIndex+1); if (A[indexDW] == 0) indexDW = index;
+ int indexDE = (xIndex+1) + M*(yIndex) + N*M*(zIndex+1); if (A[indexDE] == 0) indexDE = index;
+
+ int indexUN = (xIndex) + M*(yIndex-1) + N*M*(zIndex-1); if (A[indexUN] == 0) indexUN = index;
+ int indexUS = (xIndex) + M*(yIndex+1) + N*M*(zIndex-1); if (A[indexUS] == 0) indexUS = index;
+ int indexDN = (xIndex) + M*(yIndex-1) + N*M*(zIndex+1); if (A[indexDN] == 0) indexDN = index;
+ int indexDS = (xIndex) + M*(yIndex+1) + N*M*(zIndex+1); if (A[indexDS] == 0) indexDS = index;
+
+ gradX = 0.5f*(A[indexN]-A[indexS]);
+ gradX_sq = gradX*gradX;
+ gradXX = A[indexN] + A[indexS] - 2*A[index];
+
+ gradY = 0.5f*(A[indexW]-A[indexE]);
+ gradY_sq = gradY*gradY;
+ gradYY = A[indexW] + A[indexE] - 2*A[index];
+
+ gradZ = 0.5f*(A[indexU]-A[indexD]);
+ gradZ_sq = gradZ*gradZ;
+ gradZZ = A[indexU] + A[indexD] - 2*A[index];
+
+ gradXY = 0.25f*(A[indexNW] - A[indexNE] - A[indexWS] + A[indexES]);
+ gradXZ = 0.25f*(A[indexUW] - A[indexUE] - A[indexDW] + A[indexDE]);
+ gradYZ = 0.25f*(A[indexUN] - A[indexUS] - A[indexDN] + A[indexDS]);
+
+ xy_2 = 2.0f*gradX*gradY*gradXY;
+ xyz_1 = 2.0f*gradX*gradZ*gradXZ;
+ xyz_2 = 2.0f*gradY*gradZ*gradYZ;
+
+ sq_sum = gradX_sq + gradY_sq + gradZ_sq;
+
+ if (sq_sum <= epsilon) {
+ V_norm = (gradXX*gradX_sq + gradYY*gradY_sq + gradZZ*gradZ_sq + xy_2 + xyz_1 + xyz_2)/epsilon;
+ V_orth = ((gradY_sq + gradZ_sq)*gradXX + (gradX_sq + gradZ_sq)*gradYY + (gradX_sq + gradY_sq)*gradZZ - xy_2 - xyz_1 - xyz_2)/epsilon; }
+ else {
+ V_norm = (gradXX*gradX_sq + gradYY*gradY_sq + gradZZ*gradZ_sq + xy_2 + xyz_1 + xyz_2)/sq_sum;
+ V_orth = ((gradY_sq + gradZ_sq)*gradXX + (gradX_sq + gradZ_sq)*gradYY + (gradX_sq + gradY_sq)*gradZZ - xy_2 - xyz_1 - xyz_2)/sq_sum; }
+
+ c = 1;
+ if ((1.0f + sq_sum/sigma) != 0.0f) {c = 1.0f/(1.0f + sq_sum/sigma);}
+
+ c_sq = c*c;
+ B[index] = c_sq*V_norm + c*V_orth;
+}
+
+/******************************************************/
+/********* HOST FUNCTION*************/
+extern "C" void Diff4th_GPU_kernel(float* A, float* B, int N, int M, int Z, float sigma, int iter, float tau, float lambda)
+{
+ int deviceCount = -1; // number of devices
+ cudaGetDeviceCount(&deviceCount);
+ if (deviceCount == 0) {
+ fprintf(stderr, "No CUDA devices found\n");
+ return;
+ }
+
+ int BLKXSIZE, BLKYSIZE,BLKZSIZE;
+ float *Ad, *Bd, *Cd;
+ sigma = sigma*sigma;
+
+ if (Z == 0){
+ // 4th order diffusion for 2D case
+ BLKXSIZE = 8;
+ BLKYSIZE = 16;
+
+ dim3 dimBlock(BLKXSIZE,BLKYSIZE);
+ dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));
+
+ checkCudaErrors(cudaMalloc((void**)&Ad,N*M*sizeof(float)));
+ checkCudaErrors(cudaMalloc((void**)&Bd,N*M*sizeof(float)));
+ checkCudaErrors(cudaMalloc((void**)&Cd,N*M*sizeof(float)));
+
+ checkCudaErrors(cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(Bd,A,N*M*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(Cd,A,N*M*sizeof(float),cudaMemcpyHostToDevice));
+
+ int n = 1;
+ while (n <= iter) {
+ Diff4th2D<<<dimGrid,dimBlock>>>(Bd, Cd, N, M, sigma, iter, tau);
+ cudaDeviceSynchronize();
+ checkCudaErrors( cudaPeekAtLastError() );
+ Diff4th2D_derriv<<<dimGrid,dimBlock>>>(Cd, Bd, Ad, N, M, sigma, iter, tau, lambda);
+ cudaDeviceSynchronize();
+ checkCudaErrors( cudaPeekAtLastError() );
+ n++;
+ }
+ checkCudaErrors(cudaMemcpy(B,Bd,N*M*sizeof(float),cudaMemcpyDeviceToHost));
+ cudaFree(Ad); cudaFree(Bd); cudaFree(Cd);
+ }
+
+ if (Z != 0){
+ // 4th order diffusion for 3D case
+ BLKXSIZE = 8;
+ BLKYSIZE = 8;
+ BLKZSIZE = 8;
+
+ dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
+ dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));
+
+ checkCudaErrors(cudaMalloc((void**)&Ad,N*M*Z*sizeof(float)));
+ checkCudaErrors(cudaMalloc((void**)&Bd,N*M*Z*sizeof(float)));
+ checkCudaErrors(cudaMalloc((void**)&Cd,N*M*Z*sizeof(float)));
+
+ checkCudaErrors(cudaMemcpy(Ad,A,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(Bd,A,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+ checkCudaErrors(cudaMemcpy(Cd,A,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+
+ int n = 1;
+ while (n <= iter) {
+ Diff4th3D<<<dimGrid,dimBlock>>>(Bd, Cd, N, M, Z, sigma, iter, tau);
+ cudaDeviceSynchronize();
+ checkCudaErrors( cudaPeekAtLastError() );
+ Diff4th3D_derriv<<<dimGrid,dimBlock>>>(Cd, Bd, Ad, N, M, Z, sigma, iter, tau, lambda);
+ cudaDeviceSynchronize();
+ checkCudaErrors( cudaPeekAtLastError() );
+ n++;
+ }
+ checkCudaErrors(cudaMemcpy(B,Bd,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
+ cudaFree(Ad); cudaFree(Bd); cudaFree(Cd);
+ }
+} \ No newline at end of file
diff --git a/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.h b/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.h
new file mode 100644
index 0000000..cfbb45a
--- /dev/null
+++ b/main_func/regularizers_GPU/Diffus_HO/Diff4th_GPU_kernel.h
@@ -0,0 +1,6 @@
+#ifndef __DIFF_HO_H_
+#define __DIFF_HO_H_
+
+extern "C" void Diff4th_GPU_kernel(float* A, float* B, int N, int M, int Z, float sigma, int iter, float tau, float lambda);
+
+#endif
diff --git a/main_func/regularizers_GPU/NL_Regul/NLM_GPU.cpp b/main_func/regularizers_GPU/NL_Regul/NLM_GPU.cpp
new file mode 100644
index 0000000..ff0cc90
--- /dev/null
+++ b/main_func/regularizers_GPU/NL_Regul/NLM_GPU.cpp
@@ -0,0 +1,171 @@
+#include "mex.h"
+#include <matrix.h>
+#include <math.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <stdio.h>
+#include <iostream>
+#include "NLM_GPU_kernel.h"
+
+/* CUDA implementation of the patch-based (PB) regularization for 2D and 3D images/volumes
+ * This method finds self-similar patches in data and performs one fixed point iteration to mimimize the PB penalty function
+ *
+ * References: 1. Yang Z. & Jacob M. "Nonlocal Regularization of Inverse Problems"
+ * 2. Kazantsev D. at. all "4D-CT reconstruction with unified spatial-temporal patch-based regularization"
+ *
+ * Input Parameters (mandatory):
+ * 1. Image/volume (2D/3D)
+ * 2. ratio of the searching window (e.g. 3 = (2*3+1) = 7 pixels window)
+ * 3. ratio of the similarity window (e.g. 1 = (2*1+1) = 3 pixels window)
+ * 4. h - parameter for the PB penalty function
+ * 5. lambda - regularization parameter
+
+ * Output:
+ * 1. regularized (denoised) Image/volume (N x N x N)
+ *
+ * In matlab check what kind of GPU you have with "gpuDevice" command,
+ * then set your ComputeCapability, here I use -arch compute_35
+ *
+ * Quick 2D denoising example in Matlab:
+ Im = double(imread('lena_gray_256.tif'))/255; % loading image
+ u0 = Im + .03*randn(size(Im)); u0(u0<0) = 0; % adding noise
+ ImDen = NLM_GPU(single(u0), 3, 2, 0.15, 1);
+
+ * Linux/Matlab compilation:
+ * compile in terminal: nvcc -Xcompiler -fPIC -shared -o NLM_GPU_kernel.o NLM_GPU_kernel.cu
+ * then compile in Matlab: mex -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart NLM_GPU.cpp NLM_GPU_kernel.o
+ *
+ * D. Kazantsev
+ * 2014-17
+ * Harwell/Manchester UK
+ */
+
+float pad_crop(float *A, float *Ap, int OldSizeX, int OldSizeY, int OldSizeZ, int NewSizeX, int NewSizeY, int NewSizeZ, int padXY, int switchpad_crop);
+
+void mexFunction(
+ int nlhs, mxArray *plhs[],
+ int nrhs, const mxArray *prhs[])
+{
+ int N, M, Z, i_n, j_n, k_n, numdims, SearchW, SimilW, SearchW_real, padXY, newsizeX, newsizeY, newsizeZ, switchpad_crop, count, SearchW_full, SimilW_full;
+ const int *dims;
+ float *A, *B=NULL, *Ap=NULL, *Bp=NULL, *Eucl_Vec, h, h2, lambda, val, denh2;
+
+ numdims = mxGetNumberOfDimensions(prhs[0]);
+ dims = mxGetDimensions(prhs[0]);
+
+ N = dims[0];
+ M = dims[1];
+ Z = dims[2];
+
+ if ((numdims < 2) || (numdims > 3)) {mexErrMsgTxt("The input should be 2D image or 3D volume");}
+ if (mxGetClassID(prhs[0]) != mxSINGLE_CLASS) {mexErrMsgTxt("The input in single precision is required"); }
+
+ if(nrhs != 5) mexErrMsgTxt("Five inputs reqired: Image(2D,3D), SearchW, SimilW, Threshold, Regularization parameter");
+
+ /*Handling inputs*/
+ A = (float *) mxGetData(prhs[0]); /* the image to regularize/filter */
+ SearchW_real = (int) mxGetScalar(prhs[1]); /* the searching window ratio */
+ SimilW = (int) mxGetScalar(prhs[2]); /* the similarity window ratio */
+ h = (float) mxGetScalar(prhs[3]); /* parameter for the PB filtering function */
+ lambda = (float) mxGetScalar(prhs[4]);
+
+ if (h <= 0) mexErrMsgTxt("Parmeter for the PB penalty function should be > 0");
+
+ SearchW = SearchW_real + 2*SimilW;
+
+ SearchW_full = 2*SearchW + 1; /* the full searching window size */
+ SimilW_full = 2*SimilW + 1; /* the full similarity window size */
+ h2 = h*h;
+
+ padXY = SearchW + 2*SimilW; /* padding sizes */
+ newsizeX = N + 2*(padXY); /* the X size of the padded array */
+ newsizeY = M + 2*(padXY); /* the Y size of the padded array */
+ newsizeZ = Z + 2*(padXY); /* the Z size of the padded array */
+ int N_dims[] = {newsizeX, newsizeY, newsizeZ};
+
+ /******************************2D case ****************************/
+ if (numdims == 2) {
+ /*Handling output*/
+ B = (float*)mxGetData(plhs[0] = mxCreateNumericMatrix(N, M, mxSINGLE_CLASS, mxREAL));
+ /*allocating memory for the padded arrays */
+ Ap = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
+ Bp = (float*)mxGetData(mxCreateNumericMatrix(newsizeX, newsizeY, mxSINGLE_CLASS, mxREAL));
+ Eucl_Vec = (float*)mxGetData(mxCreateNumericMatrix(SimilW_full*SimilW_full, 1, mxSINGLE_CLASS, mxREAL));
+
+ /*Gaussian kernel */
+ count = 0;
+ for(i_n=-SimilW; i_n<=SimilW; i_n++) {
+ for(j_n=-SimilW; j_n<=SimilW; j_n++) {
+ val = (float)(i_n*i_n + j_n*j_n)/(2*SimilW*SimilW);
+ Eucl_Vec[count] = exp(-val);
+ count = count + 1;
+ }} /*main neighb loop */
+
+ /**************************************************************************/
+ /*Perform padding of image A to the size of [newsizeX * newsizeY] */
+ switchpad_crop = 0; /*padding*/
+ pad_crop(A, Ap, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
+
+ /* Do PB regularization with the padded array */
+ NLM_GPU_kernel(Ap, Bp, Eucl_Vec, newsizeY, newsizeX, 0, numdims, SearchW, SimilW, SearchW_real, (float)h2, (float)lambda);
+
+ switchpad_crop = 1; /*cropping*/
+ pad_crop(Bp, B, M, N, 0, newsizeY, newsizeX, 0, padXY, switchpad_crop);
+ }
+ else
+ {
+ /******************************3D case ****************************/
+ /*Handling output*/
+ B = (float*)mxGetPr(plhs[0] = mxCreateNumericArray(3, dims, mxSINGLE_CLASS, mxREAL));
+ /*allocating memory for the padded arrays */
+ Ap = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
+ Bp = (float*)mxGetPr(mxCreateNumericArray(3, N_dims, mxSINGLE_CLASS, mxREAL));
+ Eucl_Vec = (float*)mxGetData(mxCreateNumericMatrix(SimilW_full*SimilW_full*SimilW_full, 1, mxSINGLE_CLASS, mxREAL));
+
+ /*Gaussian kernel */
+ count = 0;
+ for(i_n=-SimilW; i_n<=SimilW; i_n++) {
+ for(j_n=-SimilW; j_n<=SimilW; j_n++) {
+ for(k_n=-SimilW; k_n<=SimilW; k_n++) {
+ val = (float)(i_n*i_n + j_n*j_n + k_n*k_n)/(2*SimilW*SimilW*SimilW);
+ Eucl_Vec[count] = exp(-val);
+ count = count + 1;
+ }}} /*main neighb loop */
+ /**************************************************************************/
+ /*Perform padding of image A to the size of [newsizeX * newsizeY * newsizeZ] */
+ switchpad_crop = 0; /*padding*/
+ pad_crop(A, Ap, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
+
+ /* Do PB regularization with the padded array */
+ NLM_GPU_kernel(Ap, Bp, Eucl_Vec, newsizeY, newsizeX, newsizeZ, numdims, SearchW, SimilW, SearchW_real, (float)h2, (float)lambda);
+
+ switchpad_crop = 1; /*cropping*/
+ pad_crop(Bp, B, M, N, Z, newsizeY, newsizeX, newsizeZ, padXY, switchpad_crop);
+ } /*end else ndims*/
+}
+
+float pad_crop(float *A, float *Ap, int OldSizeX, int OldSizeY, int OldSizeZ, int NewSizeX, int NewSizeY, int NewSizeZ, int padXY, int switchpad_crop)
+{
+ /* padding-cropping function */
+ int i,j,k;
+ if (NewSizeZ > 1) {
+ for (i=0; i < NewSizeX; i++) {
+ for (j=0; j < NewSizeY; j++) {
+ for (k=0; k < NewSizeZ; k++) {
+ if (((i >= padXY) && (i < NewSizeX-padXY)) && ((j >= padXY) && (j < NewSizeY-padXY)) && ((k >= padXY) && (k < NewSizeZ-padXY))) {
+ if (switchpad_crop == 0) Ap[NewSizeX*NewSizeY*k + i*NewSizeY+j] = A[OldSizeX*OldSizeY*(k - padXY) + (i-padXY)*(OldSizeY)+(j-padXY)];
+ else Ap[OldSizeX*OldSizeY*(k - padXY) + (i-padXY)*(OldSizeY)+(j-padXY)] = A[NewSizeX*NewSizeY*k + i*NewSizeY+j];
+ }
+ }}}
+ }
+ else {
+ for (i=0; i < NewSizeX; i++) {
+ for (j=0; j < NewSizeY; j++) {
+ if (((i >= padXY) && (i < NewSizeX-padXY)) && ((j >= padXY) && (j < NewSizeY-padXY))) {
+ if (switchpad_crop == 0) Ap[i*NewSizeY+j] = A[(i-padXY)*(OldSizeY)+(j-padXY)];
+ else Ap[(i-padXY)*(OldSizeY)+(j-padXY)] = A[i*NewSizeY+j];
+ }
+ }}
+ }
+ return *Ap;
+} \ No newline at end of file
diff --git a/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu b/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu
new file mode 100644
index 0000000..17da3a8
--- /dev/null
+++ b/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.cu
@@ -0,0 +1,239 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include "NLM_GPU_kernel.h"
+
+#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
+
+inline void __checkCudaErrors(cudaError err, const char *file, const int line)
+{
+ if (cudaSuccess != err)
+ {
+ fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",
+ file, line, (int)err, cudaGetErrorString(err));
+ exit(EXIT_FAILURE);
+ }
+}
+
+extern __shared__ float sharedmem[];
+
+// run PB den kernel here
+__global__ void NLM_kernel(float *Ad, float* Bd, float *Eucl_Vec_d, int N, int M, int Z, int SearchW, int SimilW, int SearchW_real, int SearchW_full, int SimilW_full, int padXY, float h2, float lambda, dim3 imagedim, dim3 griddim, dim3 kerneldim, dim3 sharedmemdim, int nUpdatePerThread, float neighborsize)
+{
+
+ int i1, j1, k1, i2, j2, k2, i3, j3, k3, i_l, j_l, k_l, count;
+ float value, Weight_norm, normsum, Weight;
+
+ int bidx = blockIdx.x;
+ int bidy = blockIdx.y%griddim.y;
+ int bidz = (int)((blockIdx.y)/griddim.y);
+
+ // global index for block endpoint
+ int beidx = __mul24(bidx,blockDim.x);
+ int beidy = __mul24(bidy,blockDim.y);
+ int beidz = __mul24(bidz,blockDim.z);
+
+ int tid = __mul24(threadIdx.z,__mul24(blockDim.x,blockDim.y)) +
+ __mul24(threadIdx.y,blockDim.x) + threadIdx.x;
+
+ #ifdef __DEVICE_EMULATION__
+ printf("tid : %d", tid);
+ #endif
+
+ // update shared memory
+ int nthreads = blockDim.x*blockDim.y*blockDim.z;
+ int sharedMemSize = sharedmemdim.x * sharedmemdim.y * sharedmemdim.z;
+ for(int i=0; i<nUpdatePerThread; i++)
+ {
+ int sid = tid + i*nthreads; // index in shared memory
+ if (sid < sharedMemSize)
+ {
+ // global x/y/z index in volume
+ int gidx, gidy, gidz;
+ int sidx, sidy, sidz, tid;
+
+ sidz = sid / (sharedmemdim.x*sharedmemdim.y);
+ tid = sid - sidz*(sharedmemdim.x*sharedmemdim.y);
+ sidy = tid / (sharedmemdim.x);
+ sidx = tid - sidy*(sharedmemdim.x);
+
+ gidx = (int)sidx - (int)kerneldim.x + (int)beidx;
+ gidy = (int)sidy - (int)kerneldim.y + (int)beidy;
+ gidz = (int)sidz - (int)kerneldim.z + (int)beidz;
+
+ // Neumann boundary condition
+ int cx = (int) min(max(0,gidx),imagedim.x-1);
+ int cy = (int) min(max(0,gidy),imagedim.y-1);
+ int cz = (int) min(max(0,gidz),imagedim.z-1);
+
+ int gid = cz*imagedim.x*imagedim.y + cy*imagedim.x + cx;
+
+ sharedmem[sid] = Ad[gid];
+ }
+ }
+ __syncthreads();
+
+ // global index of the current voxel in the input volume
+ int idx = beidx + threadIdx.x;
+ int idy = beidy + threadIdx.y;
+ int idz = beidz + threadIdx.z;
+
+ if (Z == 1) {
+ /* 2D case */
+ /*checking boundaries to be within the image and avoid padded spaces */
+ if( idx >= padXY && idx < (imagedim.x - padXY) &&
+ idy >= padXY && idy < (imagedim.y - padXY))
+ {
+ int i_centr = threadIdx.x + (SearchW); /*indices of the centrilized (main) pixel */
+ int j_centr = threadIdx.y + (SearchW); /*indices of the centrilized (main) pixel */
+
+ if ((i_centr > 0) && (i_centr < N) && (j_centr > 0) && (j_centr < M)) {
+
+ Weight_norm = 0; value = 0.0;
+ /* Massive Search window loop */
+ for(i1 = i_centr - SearchW_real ; i1 <= i_centr + SearchW_real; i1++) {
+ for(j1 = j_centr - SearchW_real ; j1<= j_centr + SearchW_real ; j1++) {
+ /* if inside the searching window */
+ count = 0; normsum = 0.0;
+ for(i_l=-SimilW; i_l<=SimilW; i_l++) {
+ for(j_l=-SimilW; j_l<=SimilW; j_l++) {
+ i2 = i1+i_l; j2 = j1+j_l;
+ i3 = i_centr+i_l; j3 = j_centr+j_l; /*coordinates of the inner patch loop */
+ if ((i2 > 0) && (i2 < N) && (j2 > 0) && (j2 < M)) {
+ if ((i3 > 0) && (i3 < N) && (j3 > 0) && (j3 < M)) {
+ normsum += Eucl_Vec_d[count]*pow((sharedmem[(j3)*sharedmemdim.x+(i3)] - sharedmem[j2*sharedmemdim.x+i2]), 2);
+ }}
+ count++;
+ }}
+ if (normsum != 0) Weight = (expf(-normsum/h2));
+ else Weight = 0.0;
+ Weight_norm += Weight;
+ value += sharedmem[j1*sharedmemdim.x+i1]*Weight;
+ }}
+
+ if (Weight_norm != 0) Bd[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx] = value/Weight_norm;
+ else Bd[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx] = Ad[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx];
+ }
+ } /*boundary conditions end*/
+ }
+ else {
+ /*3D case*/
+ /*checking boundaries to be within the image and avoid padded spaces */
+ if( idx >= padXY && idx < (imagedim.x - padXY) &&
+ idy >= padXY && idy < (imagedim.y - padXY) &&
+ idz >= padXY && idz < (imagedim.z - padXY) )
+ {
+ int i_centr = threadIdx.x + SearchW; /*indices of the centrilized (main) pixel */
+ int j_centr = threadIdx.y + SearchW; /*indices of the centrilized (main) pixel */
+ int k_centr = threadIdx.z + SearchW; /*indices of the centrilized (main) pixel */
+
+ if ((i_centr > 0) && (i_centr < N) && (j_centr > 0) && (j_centr < M) && (k_centr > 0) && (k_centr < Z)) {
+
+ Weight_norm = 0; value = 0.0;
+ /* Massive Search window loop */
+ for(i1 = i_centr - SearchW_real ; i1 <= i_centr + SearchW_real; i1++) {
+ for(j1 = j_centr - SearchW_real ; j1<= j_centr + SearchW_real ; j1++) {
+ for(k1 = k_centr - SearchW_real ; k1<= k_centr + SearchW_real ; k1++) {
+ /* if inside the searching window */
+ count = 0; normsum = 0.0;
+ for(i_l=-SimilW; i_l<=SimilW; i_l++) {
+ for(j_l=-SimilW; j_l<=SimilW; j_l++) {
+ for(k_l=-SimilW; k_l<=SimilW; k_l++) {
+ i2 = i1+i_l; j2 = j1+j_l; k2 = k1+k_l;
+ i3 = i_centr+i_l; j3 = j_centr+j_l; k3 = k_centr+k_l; /*coordinates of the inner patch loop */
+ if ((i2 > 0) && (i2 < N) && (j2 > 0) && (j2 < M) && (k2 > 0) && (k2 < Z)) {
+ if ((i3 > 0) && (i3 < N) && (j3 > 0) && (j3 < M) && (k3 > 0) && (k3 < Z)) {
+ normsum += Eucl_Vec_d[count]*pow((sharedmem[(k3)*sharedmemdim.x*sharedmemdim.y + (j3)*sharedmemdim.x+(i3)] - sharedmem[(k2)*sharedmemdim.x*sharedmemdim.y + j2*sharedmemdim.x+i2]), 2);
+ }}
+ count++;
+ }}}
+ if (normsum != 0) Weight = (expf(-normsum/h2));
+ else Weight = 0.0;
+ Weight_norm += Weight;
+ value += sharedmem[k1*sharedmemdim.x*sharedmemdim.y + j1*sharedmemdim.x+i1]*Weight;
+ }}} /* BIG search window loop end*/
+
+
+ if (Weight_norm != 0) Bd[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx] = value/Weight_norm;
+ else Bd[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx] = Ad[idz*imagedim.x*imagedim.y + idy*imagedim.x + idx];
+ }
+ } /* boundary conditions end */
+ }
+}
+
+/////////////////////////////////////////////////
+// HOST FUNCTION
+extern "C" void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec, int N, int M, int Z, int dimension, int SearchW, int SimilW, int SearchW_real, float h2, float lambda)
+{
+ int deviceCount = -1; // number of devices
+ cudaGetDeviceCount(&deviceCount);
+ if (deviceCount == 0) {
+ fprintf(stderr, "No CUDA devices found\n");
+ return;
+ }
+
+// cudaDeviceReset();
+
+ int padXY, SearchW_full, SimilW_full, blockWidth, blockHeight, blockDepth, nBlockX, nBlockY, nBlockZ, kernel_depth;
+ float *Ad, *Bd, *Eucl_Vec_d;
+
+ if (dimension == 2) {
+ blockWidth = 16;
+ blockHeight = 16;
+ blockDepth = 1;
+ Z = 1;
+ kernel_depth = 0;
+ }
+ else {
+ blockWidth = 8;
+ blockHeight = 8;
+ blockDepth = 8;
+ kernel_depth = SearchW;
+ }
+
+ // compute how many blocks are needed
+ nBlockX = ceil((float)N / (float)blockWidth);
+ nBlockY = ceil((float)M / (float)blockHeight);
+ nBlockZ = ceil((float)Z / (float)blockDepth);
+
+ dim3 dimGrid(nBlockX,nBlockY*nBlockZ);
+ dim3 dimBlock(blockWidth, blockHeight, blockDepth);
+ dim3 imagedim(N,M,Z);
+ dim3 griddim(nBlockX,nBlockY,nBlockZ);
+
+ dim3 kerneldim(SearchW,SearchW,kernel_depth);
+ dim3 sharedmemdim((SearchW*2)+blockWidth,(SearchW*2)+blockHeight,(kernel_depth*2)+blockDepth);
+ int sharedmemsize = sizeof(float)*sharedmemdim.x*sharedmemdim.y*sharedmemdim.z;
+ int updateperthread = ceil((float)(sharedmemdim.x*sharedmemdim.y*sharedmemdim.z)/(float)(blockWidth*blockHeight*blockDepth));
+ float neighborsize = (2*SearchW+1)*(2*SearchW+1)*(2*kernel_depth+1);
+
+ padXY = SearchW + 2*SimilW; /* padding sizes */
+
+ SearchW_full = 2*SearchW + 1; /* the full searching window size */
+ SimilW_full = 2*SimilW + 1; /* the full similarity window size */
+
+ /*allocate space for images on device*/
+ checkCudaErrors( cudaMalloc((void**)&Ad,N*M*Z*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&Bd,N*M*Z*sizeof(float)) );
+ /*allocate space for vectors on device*/
+ if (dimension == 2) {
+ checkCudaErrors( cudaMalloc((void**)&Eucl_Vec_d,SimilW_full*SimilW_full*sizeof(float)) );
+ checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );
+ }
+ else {
+ checkCudaErrors( cudaMalloc((void**)&Eucl_Vec_d,SimilW_full*SimilW_full*SimilW_full*sizeof(float)) );
+ checkCudaErrors( cudaMemcpy(Eucl_Vec_d,Eucl_Vec,SimilW_full*SimilW_full*SimilW_full*sizeof(float),cudaMemcpyHostToDevice) );
+ }
+
+ /* copy data from the host to device */
+ checkCudaErrors( cudaMemcpy(Ad,A,N*M*Z*sizeof(float),cudaMemcpyHostToDevice) );
+
+ // Run CUDA kernel here
+ NLM_kernel<<<dimGrid,dimBlock,sharedmemsize>>>(Ad, Bd, Eucl_Vec_d, M, N, Z, SearchW, SimilW, SearchW_real, SearchW_full, SimilW_full, padXY, h2, lambda, imagedim, griddim, kerneldim, sharedmemdim, updateperthread, neighborsize);
+
+ checkCudaErrors( cudaPeekAtLastError() );
+// gpuErrchk( cudaDeviceSynchronize() );
+
+ checkCudaErrors( cudaMemcpy(B,Bd,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost) );
+ cudaFree(Ad); cudaFree(Bd); cudaFree(Eucl_Vec_d);
+}
diff --git a/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.h b/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.h
new file mode 100644
index 0000000..bc9d4a3
--- /dev/null
+++ b/main_func/regularizers_GPU/NL_Regul/NLM_GPU_kernel.h
@@ -0,0 +1,6 @@
+#ifndef __NLMREG_KERNELS_H_
+#define __NLMREG_KERNELS_H_
+
+extern "C" void NLM_GPU_kernel(float *A, float* B, float *Eucl_Vec, int N, int M, int Z, int dimension, int SearchW, int SimilW, int SearchW_real, float denh2, float lambda);
+
+#endif