-rw-r--r--   Core/regularizers_CPU/FGP_TV_core.c              | 481
-rw-r--r--   Core/regularizers_CPU/FGP_TV_core.h              |  51
-rwxr-xr-x   Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu  | 561
-rwxr-xr-x   Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h   |  10
4 files changed, 867 insertions, 236 deletions
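
Before the per-file hunks, here is a minimal sketch of how the reworked CPU entry point introduced by this commit would be driven. Only the FGP_TV_CPU signature and the parameter meanings are taken from the diff below; the image size, parameter values, synthetic input data, and the assumption that the project's include path provides FGP_TV_core.h (and CCPiDefines.h) are illustrative.

/* Hypothetical standalone driver for the new CPU API; values are illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include "FGP_TV_core.h"   /* assumes the project's include paths are set up */

int main(void)
{
    int dimX = 256, dimY = 256, dimZ = 1;   /* dimZ <= 1 selects the 2D branch */
    int i, N = dimX*dimY;
    float *noisy    = (float *) malloc(N * sizeof(float));
    float *denoised = (float *) malloc(N * sizeof(float));
    if (noisy == NULL || denoised == NULL) return EXIT_FAILURE;

    /* stand-in input; a real caller would load a noisy image here */
    for (i = 0; i < N; i++) noisy[i] = (float)(i % 17) / 16.0f;

    /* Input, Output, lambda, iter, epsil, methodTV (0 = iso), nonneg, printM, dims */
    FGP_TV_CPU(noisy, denoised, 0.05f, 100, 1e-04f, 0, 0, 1, dimX, dimY, dimZ);

    printf("first denoised pixel: %f\n", denoised[0]);
    free(noisy); free(denoised);
    return EXIT_SUCCESS;
}
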
diff --git a/Core/regularizers_CPU/FGP_TV_core.c b/Core/regularizers_CPU/FGP_TV_core.c index 03cd445..304848d 100644 --- a/Core/regularizers_CPU/FGP_TV_core.c +++ b/Core/regularizers_CPU/FGP_TV_core.c @@ -22,245 +22,314 @@ limitations under the License. /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) * * Input Parameters: - * 1. Noisy image/volume [REQUIRED] - * 2. lambda - regularization parameter [REQUIRED] - * 3. Number of iterations [OPTIONAL parameter] - * 4. eplsilon: tolerance constant [OPTIONAL parameter] - * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * 7. print information: 0 (off) or 1 (on) * * Output: * [1] Filtered/regularized image - * [2] last function value - * - * Example of image denoising: - * figure; - * Im = double(imread('lena_gray_256.tif'))/255; % loading image - * u0 = Im + .05*randn(size(Im)); % adding noise - * u = FGP_TV(single(u0), 0.05, 100, 1e-04); * * This function is based on the Matlab's code and paper by * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" - * - * D. Kazantsev, 2016-17 - * */ - -/* 2D-case related Functions */ -/*****************************************************************/ -float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY) -{ - int i,j; - float f1, f2, val1, val2; - - /*data-related term */ - f1 = 0.0f; - for(i=0; i<dimX*dimY; i++) f1 += pow(D[i] - A[i],2); - - /*TV-related term */ - f2 = 0.0f; - for(i=0; i<dimX; i++) { - for(j=0; j<dimY; j++) { - /* boundary conditions */ - if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(i+1)*dimY + (j)] - A[(i)*dimY + (j)];} - if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(i)*dimY + (j+1)] - A[(i)*dimY + (j)];} - f2 += sqrt(pow(val1,2) + pow(val2,2)); - }} - - /* sum of two terms */ - funcvalA[0] = 0.5f*f1 + lambda*f2; - return *funcvalA; + +float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +{ + int ll, j, DimTotal; + float re, re1; + float tk = 1.0f; + float tkp1=1.0f; + int count = 0; + + if (dimZ <= 1) { + /*2D case */ + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + DimTotal = dimX*dimY; + + Output_prev = (float *) malloc( DimTotal * sizeof(float) ); + P1 = (float *) malloc( DimTotal * sizeof(float) ); + P2 = (float *) malloc( DimTotal * sizeof(float) ); + P1_prev = (float *) malloc( DimTotal * sizeof(float) ); + P2_prev = (float *) malloc( DimTotal * sizeof(float) ); + R1 = (float *) malloc( DimTotal * sizeof(float) ); + R2 = (float *) malloc( DimTotal * sizeof(float) ); + + /* begin iterations */ + for(ll=0; ll<iter; ll++) { + + /* computing the gradient of the objective function */ + Obj_func2D(Input, Output, R1, R2, lambda, dimX, dimY); + + /* apply nonnegativity */ + if (nonneg == 1) for(j=0; j<dimX*dimY; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + + /*Taking a step towards minus of the gradient*/ + Grad_func2D(P1, P2, Output, R1, R2, lambda, dimX, dimY); + + /* projection step */ + Proj_func2D(P1, P2, methodTV, dimX, dimY); + + /*updating R and t*/ + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, 
R2, tkp1, tk, dimX, dimY); + + /* check early stopping criteria */ + re = 0.0f; re1 = 0.0f; + for(j=0; j<dimX*dimY; j++) + { + re += pow(Output[j] - Output_prev[j],2); + re1 += pow(Output[j],2); + } + re = sqrt(re)/sqrt(re1); + if (re < epsil) count++; + if (count > 4) break; + + /*storing old values*/ + copyIm(Output, Output_prev, dimX, dimY, 1); + copyIm(P1, P1_prev, dimX, dimY, 1); + copyIm(P2, P2_prev, dimX, dimY, 1); + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll); + free(Output_prev); free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); + } + else { + /*3D case*/ + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + DimTotal = dimX*dimY*dimZ; + + Output_prev = (float *) malloc( DimTotal * sizeof(float) ); + P1 = (float *) malloc( DimTotal * sizeof(float) ); + P2 = (float *) malloc( DimTotal * sizeof(float) ); + P3 = (float *) malloc( DimTotal * sizeof(float) ); + P1_prev = (float *) malloc( DimTotal * sizeof(float) ); + P2_prev = (float *) malloc( DimTotal * sizeof(float) ); + P3_prev = (float *) malloc( DimTotal * sizeof(float) ); + R1 = (float *) malloc( DimTotal * sizeof(float) ); + R2 = (float *) malloc( DimTotal * sizeof(float) ); + R3 = (float *) malloc( DimTotal * sizeof(float) ); + + /* begin iterations */ + for(ll=0; ll<iter; ll++) { + + /* computing the gradient of the objective function */ + Obj_func3D(Input, Output, R1, R2, R3, lambda, dimX, dimY, dimZ); + + /* apply nonnegativity */ + if (nonneg == 1) for(j=0; j<dimX*dimY*dimZ; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + + /*Taking a step towards minus of the gradient*/ + Grad_func3D(P1, P2, P3, Output, R1, R2, R3, lambda, dimX, dimY, dimZ); + + /* projection step */ + Proj_func3D(P1, P2, P3, methodTV, dimX, dimY, dimZ); + + /*updating R and t*/ + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, dimX, dimY, dimZ); + + /* calculate norm - stopping rules*/ + re = 0.0f; re1 = 0.0f; + for(j=0; j<dimX*dimY*dimZ; j++) + { + re += pow(Output[j] - Output_prev[j],2); + re1 += pow(Output[j],2); + } + re = sqrt(re)/sqrt(re1); + /* stop if the norm residual is less than the tolerance EPS */ + if (re < epsil) count++; + if (count > 4) break; + + /*storing old values*/ + copyIm(Output, Output_prev, dimX, dimY, dimZ); + copyIm(P1, P1_prev, dimX, dimY, dimZ); + copyIm(P2, P2_prev, dimX, dimY, dimZ); + copyIm(P3, P3_prev, dimX, dimY, dimZ); + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", ll); + free(Output_prev); free(P1); free(P2); free(P3); free(P1_prev); free(P2_prev); free(P3_prev); free(R1); free(R2); free(R3); + } + return *Output; } float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY) { - float val1, val2; - int i, j; -#pragma omp parallel for shared(A,D,R1,R2) private(i,j,val1,val2) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - /* boundary conditions */ - if (i == 0) { val1 = 0.0f; } - else { val1 = R1[(i - 1)*dimY + (j)]; } - if (j == 0) { val2 = 0.0f; } - else { val2 = R2[(i)*dimY + (j - 1)]; } - D[(i)*dimY + (j)] = A[(i)*dimY + (j)] - lambda*(R1[(i)*dimY + (j)] + R2[(i)*dimY + (j)] - val1 - val2); - } - } - return *D; + float val1, val2; + int i,j,index; +#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + index = j*dimX+i; + /* boundary 
conditions */ + if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];} + if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];} + D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2); + }} + return *D; } float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY) { - float val1, val2, multip; - int i, j; - multip = (1.0f / (8.0f*lambda)); -#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(i,j,val1,val2) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - /* boundary conditions */ - if (i == dimX - 1) val1 = 0.0f; else val1 = D[(i)*dimY + (j)] - D[(i + 1)*dimY + (j)]; - if (j == dimY - 1) val2 = 0.0f; else val2 = D[(i)*dimY + (j)] - D[(i)*dimY + (j + 1)]; - P1[(i)*dimY + (j)] = R1[(i)*dimY + (j)] + multip*val1; - P2[(i)*dimY + (j)] = R2[(i)*dimY + (j)] + multip*val2; - } - } - return 1; + float val1, val2, multip; + int i,j,index; + multip = (1.0f/(8.0f*lambda)); +#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + index = j*dimX+i; + /* boundary conditions */ + if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)]; + if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i]; + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + }} + return 1; } float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY) { - float val1, val2, denom; - int i, j; - if (methTV == 0) { - /* isotropic TV*/ -#pragma omp parallel for shared(P1,P2) private(i,j,denom) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - denom = pow(P1[(i)*dimY + (j)], 2) + pow(P2[(i)*dimY + (j)], 2); - if (denom > 1) { - P1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] / sqrt(denom); - P2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] / sqrt(denom); - } - } - } - } - else { - /* anisotropic TV*/ -#pragma omp parallel for shared(P1,P2) private(i,j,val1,val2) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - val1 = fabs(P1[(i)*dimY + (j)]); - val2 = fabs(P2[(i)*dimY + (j)]); - if (val1 < 1.0f) { val1 = 1.0f; } - if (val2 < 1.0f) { val2 = 1.0f; } - P1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] / val1; - P2[(i)*dimY + (j)] = P2[(i)*dimY + (j)] / val2; - } - } - } - return 1; + float val1, val2, denom, sq_denom; + int i,j,index; + if (methTV == 0) { + /* isotropic TV*/ +#pragma omp parallel for shared(P1,P2) private(index,i,j,denom,sq_denom) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + index = j*dimX+i; + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] = P1[index]*sq_denom; + P2[index] = P2[index]*sq_denom; + } + }} + } + else { + /* anisotropic TV*/ +#pragma omp parallel for shared(P1,P2) private(index,i,j,val1,val2) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + index = j*dimX+i; + val1 = fabs(P1[index]); + val2 = fabs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + }} + } + return 1; } float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY) { - int i, j; - float multip; - multip = ((tk - 1.0f) / tkp1); -#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i,j) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - R1[(i)*dimY + (j)] = P1[(i)*dimY + (j)] + multip*(P1[(i)*dimY + (j)] - P1_old[(i)*dimY + (j)]); - R2[(i)*dimY + (j)] = 
P2[(i)*dimY + (j)] + multip*(P2[(i)*dimY + (j)] - P2_old[(i)*dimY + (j)]); - } - } - return 1; + int i,j,index; + float multip; + multip = ((tk-1.0f)/tkp1); +#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(index,i,j) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + index = j*dimX+i; + R1[index] = P1[index] + multip*(P1[index] - P1_old[index]); + R2[index] = P2[index] + multip*(P2[index] - P2_old[index]); + }} + return 1; } /* 3D-case related Functions */ /*****************************************************************/ -float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ) -{ - int i,j,k; - float f1, f2, val1, val2, val3; - - /*data-related term */ - f1 = 0.0f; - for(i=0; i<dimX*dimY*dimZ; i++) f1 += pow(D[i] - A[i],2); - - /*TV-related term */ - f2 = 0.0f; +float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ) +{ + float val1, val2, val3; + int i,j,k,index; +#pragma omp parallel for shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3) for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { for(k=0; k<dimZ; k++) { - /* boundary conditions */ - if (i == dimX-1) {val1 = 0.0f;} else {val1 = A[(dimX*dimY)*k + (i+1)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];} - if (j == dimY-1) {val2 = 0.0f;} else {val2 = A[(dimX*dimY)*k + (i)*dimY + (j+1)] - A[(dimX*dimY)*k + (i)*dimY + (j)];} - if (k == dimZ-1) {val3 = 0.0f;} else {val3 = A[(dimX*dimY)*(k+1) + (i)*dimY + (j)] - A[(dimX*dimY)*k + (i)*dimY + (j)];} - f2 += sqrt(pow(val1,2) + pow(val2,2) + pow(val3,2)); - }}} - /* sum of two terms */ - funcvalA[0] = 0.5f*f1 + lambda*f2; - return *funcvalA; -} - -float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ) -{ - float val1, val2, val3; - int i, j, k; -#pragma omp parallel for shared(A,D,R1,R2,R3) private(i,j,k,val1,val2,val3) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - for (k = 0; k<dimZ; k++) { - /* boundary conditions */ - if (i == 0) { val1 = 0.0f; } - else { val1 = R1[(dimX*dimY)*k + (i - 1)*dimY + (j)]; } - if (j == 0) { val2 = 0.0f; } - else { val2 = R2[(dimX*dimY)*k + (i)*dimY + (j - 1)]; } - if (k == 0) { val3 = 0.0f; } - else { val3 = R3[(dimX*dimY)*(k - 1) + (i)*dimY + (j)]; } - D[(dimX*dimY)*k + (i)*dimY + (j)] = A[(dimX*dimY)*k + (i)*dimY + (j)] - lambda*(R1[(dimX*dimY)*k + (i)*dimY + (j)] + R2[(dimX*dimY)*k + (i)*dimY + (j)] + R3[(dimX*dimY)*k + (i)*dimY + (j)] - val1 - val2 - val3); - } - } - } - return *D; + index = (dimX*dimY)*k + j*dimX+i; + /* boundary conditions */ + if (i == 0) {val1 = 0.0f;} else {val1 = R1[(dimX*dimY)*k + j*dimX + (i-1)];} + if (j == 0) {val2 = 0.0f;} else {val2 = R2[(dimX*dimY)*k + (j-1)*dimX + i];} + if (k == 0) {val3 = 0.0f;} else {val3 = R3[(dimX*dimY)*(k-1) + j*dimX + i];} + D[index] = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); + }}} + return *D; } float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ) { - float val1, val2, val3, multip; - int i, j, k; - multip = (1.0f / (8.0f*lambda)); -#pragma omp parallel for shared(P1,P2,P3,D,R1,R2,R3,multip) private(i,j,k,val1,val2,val3) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - for (k = 0; k<dimZ; k++) { - /* boundary conditions */ - if (i == dimX - 1) val1 = 0.0f; else val1 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*k + (i + 1)*dimY + (j)]; - if (j == dimY - 1) 
val2 = 0.0f; else val2 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*k + (i)*dimY + (j + 1)]; - if (k == dimZ - 1) val3 = 0.0f; else val3 = D[(dimX*dimY)*k + (i)*dimY + (j)] - D[(dimX*dimY)*(k + 1) + (i)*dimY + (j)]; - P1[(dimX*dimY)*k + (i)*dimY + (j)] = R1[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val1; - P2[(dimX*dimY)*k + (i)*dimY + (j)] = R2[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val2; - P3[(dimX*dimY)*k + (i)*dimY + (j)] = R3[(dimX*dimY)*k + (i)*dimY + (j)] + multip*val3; - } - } - } - return 1; + float val1, val2, val3, multip; + int i,j,k, index; + multip = (1.0f/(8.0f*lambda)); +#pragma omp parallel for shared(P1,P2,P3,D,R1,R2,R3,multip) private(index,i,j,k,val1,val2,val3) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + for(k=0; k<dimZ; k++) { + index = (dimX*dimY)*k + j*dimX+i; + /* boundary conditions */ + if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)]; + if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i]; + if (k == dimZ-1) val3 = 0.0f; else val3 = D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i]; + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + P3[index] = R3[index] + multip*val3; + }}} + return 1; } -float Proj_func3D(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ) -{ - float val1, val2, val3; - int i, j, k; -#pragma omp parallel for shared(P1,P2,P3) private(i,j,k,val1,val2,val3) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - for (k = 0; k<dimZ; k++) { - val1 = fabs(P1[(dimX*dimY)*k + (i)*dimY + (j)]); - val2 = fabs(P2[(dimX*dimY)*k + (i)*dimY + (j)]); - val3 = fabs(P3[(dimX*dimY)*k + (i)*dimY + (j)]); - if (val1 < 1.0f) { val1 = 1.0f; } - if (val2 < 1.0f) { val2 = 1.0f; } - if (val3 < 1.0f) { val3 = 1.0f; } - - P1[(dimX*dimY)*k + (i)*dimY + (j)] = P1[(dimX*dimY)*k + (i)*dimY + (j)] / val1; - P2[(dimX*dimY)*k + (i)*dimY + (j)] = P2[(dimX*dimY)*k + (i)*dimY + (j)] / val2; - P3[(dimX*dimY)*k + (i)*dimY + (j)] = P3[(dimX*dimY)*k + (i)*dimY + (j)] / val3; - } +float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ) +{ + float val1, val2, val3, denom, sq_denom; + int i,j,k, index; + if (methTV == 0) { + /* isotropic TV*/ + #pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3,sq_denom) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + for(k=0; k<dimZ; k++) { + index = (dimX*dimY)*k + j*dimX+i; + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] = P1[index]*sq_denom; + P2[index] = P2[index]*sq_denom; + P3[index] = P3[index]*sq_denom; + } + }}} + } + else { + /* anisotropic TV*/ +#pragma omp parallel for shared(P1,P2,P3) private(index,i,j,k,val1,val2,val3) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + for(k=0; k<dimZ; k++) { + index = (dimX*dimY)*k + j*dimX+i; + val1 = fabs(P1[index]); + val2 = fabs(P2[index]); + val3 = fabs(P3[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + P3[index] = P3[index]/val3; + }}} } - } - return 1; + return 1; } float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ) { - int i, j, k; - float multip; - multip = ((tk - 1.0f) / tkp1); -#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) 
private(i,j,k) - for (i = 0; i<dimX; i++) { - for (j = 0; j<dimY; j++) { - for (k = 0; k<dimZ; k++) { - R1[(dimX*dimY)*k + (i)*dimY + (j)] = P1[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P1[(dimX*dimY)*k + (i)*dimY + (j)] - P1_old[(dimX*dimY)*k + (i)*dimY + (j)]); - R2[(dimX*dimY)*k + (i)*dimY + (j)] = P2[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P2[(dimX*dimY)*k + (i)*dimY + (j)] - P2_old[(dimX*dimY)*k + (i)*dimY + (j)]); - R3[(dimX*dimY)*k + (i)*dimY + (j)] = P3[(dimX*dimY)*k + (i)*dimY + (j)] + multip*(P3[(dimX*dimY)*k + (i)*dimY + (j)] - P3_old[(dimX*dimY)*k + (i)*dimY + (j)]); - } - } - } - return 1; + int i,j,k,index; + float multip; + multip = ((tk-1.0f)/tkp1); +#pragma omp parallel for shared(P1,P2,P3,P1_old,P2_old,P3_old,R1,R2,R3,multip) private(index,i,j,k) + for(i=0; i<dimX; i++) { + for(j=0; j<dimY; j++) { + for(k=0; k<dimZ; k++) { + index = (dimX*dimY)*k + j*dimX+i; + R1[index] = P1[index] + multip*(P1[index] - P1_old[index]); + R2[index] = P2[index] + multip*(P2[index] - P2_old[index]); + R3[index] = P3[index] + multip*(P3[index] - P3_old[index]); + }}} + return 1; } - - diff --git a/Core/regularizers_CPU/FGP_TV_core.h b/Core/regularizers_CPU/FGP_TV_core.h index 10ea224..b591819 100644 --- a/Core/regularizers_CPU/FGP_TV_core.h +++ b/Core/regularizers_CPU/FGP_TV_core.h @@ -27,46 +27,37 @@ limitations under the License. #include "CCPiDefines.h" /* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) -* -* Input Parameters: -* 1. Noisy image/volume [REQUIRED] -* 2. lambda - regularization parameter [REQUIRED] -* 3. Number of iterations [OPTIONAL parameter] -* 4. eplsilon: tolerance constant [OPTIONAL parameter] -* 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* -* Output: -* [1] Filtered/regularized image -* [2] last function value -* -* Example of image denoising: -* figure; -* Im = double(imread('lena_gray_256.tif'))/255; % loading image -* u0 = Im + .05*randn(size(Im)); % adding noise -* u = FGP_TV(single(u0), 0.05, 100, 1e-04); -* -* to compile with OMP support: mex FGP_TV.c CFLAGS="\$CFLAGS -fopenmp -Wall -std=c99" LDFLAGS="\$LDFLAGS -fopenmp" -* This function is based on the Matlab's code and paper by -* [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" -* -* D. Kazantsev, 2016-17 -* -*/ + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * 7. 
print information: 0 (off) or 1 (on) + * + * Output: + * [1] Filtered/regularized image + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + #ifdef __cplusplus extern "C" { #endif -//float copyIm(float *A, float *B, int dimX, int dimY, int dimZ); +CCPI_EXPORT float FGP_TV_CPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); + CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, int dimX, int dimY); CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, int dimX, int dimY); CCPI_EXPORT float Proj_func2D(float *P1, float *P2, int methTV, int dimX, int dimY); CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, int dimX, int dimY); -CCPI_EXPORT float Obj_func_CALC2D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY); CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ); CCPI_EXPORT float Grad_func3D(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, float lambda, int dimX, int dimY, int dimZ); -CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int dimX, int dimY, int dimZ); +CCPI_EXPORT float Proj_func3D(float *P1, float *P2, float *P3, int methTV, int dimX, int dimY, int dimZ); CCPI_EXPORT float Rupd_func3D(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, int dimX, int dimY, int dimZ); -CCPI_EXPORT float Obj_func_CALC3D(float *A, float *D, float *funcvalA, float lambda, int dimX, int dimY, int dimZ); #ifdef __cplusplus } -#endif
\ No newline at end of file +#endif diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu new file mode 100755 index 0000000..21a95c9 --- /dev/null +++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.cu @@ -0,0 +1,561 @@ + /* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include "FGP_TV_GPU_core.h" +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> + +/* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * 7. print information: 0 (off) or 1 (on) + * + * Output: + * [1] Filtered/regularized image + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) + +inline void __checkCudaErrors(cudaError err, const char *file, const int line) +{ + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", + file, line, (int)err, cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +#define BLKXSIZE2D 16 +#define BLKYSIZE2D 16 + +#define BLKXSIZE 8 +#define BLKYSIZE 8 +#define BLKZSIZE 8 + +#define idivup(a, b) ( ((a)%(b) != 0) ? 
(a)/(b)+1 : (a)/(b) ) +struct square { __host__ __device__ float operator()(float x) { return x * x; } }; + +/************************************************/ +/*****************2D modules*********************/ +/************************************************/ +__global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) +{ + + float val1,val2; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} + if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} + //Write final result to global memory + D[index] = Ad[index] - lambda*(R1[index] + R2[index] - val1 - val2); + } + return; +} + +__global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip) +{ + + float val1,val2; + + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + + /* boundary conditions */ + if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; + if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; + + //Write final result to global memory + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + } + return; +} + +__global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float denom; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); + if (denom > 1.0f) { + P1[index] = P1[index]/sqrt(denom); + P2[index] = P2[index]/sqrt(denom); + } + } + return; +} +__global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) +{ + + float val1, val2; + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + } + return; +} +__global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, float multip2, int N, int M, int ImSize) +{ + //calculate each thread global index + const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; + const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { + R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); + R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); + } + return; +} +__global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} +__global__ void copy_kernel2D(float *Input, float* 
Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} +__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} +/************************************************/ +/*****************3D modules*********************/ +/************************************************/ +__global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) +{ + + float val1,val2,val3; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} + if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} + if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];} + //Write final result to global memory + D[index] = Ad[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); + } + return; +} + +__global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip) +{ + + float val1,val2,val3; + + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + /* boundary conditions */ + if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; + if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; + if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; + + //Write final result to global memory + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + P3[index] = R3[index] + multip*val3; + } + return; +} + +__global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float denom,sq_denom; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); + + if (denom > 1.0f) { + sq_denom = 1.0f/sqrt(denom); + P1[index] = P1[index]*sq_denom; + P2[index] = P2[index]*sq_denom; + P3[index] = P3[index]*sq_denom; + } + } + return; +} + +__global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) +{ + + float val1, val2, val3; + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + val1 = abs(P1[index]); + val2 = abs(P2[index]); + val3 = abs(P3[index]); 
+ if (val1 < 1.0f) {val1 = 1.0f;} + if (val2 < 1.0f) {val2 = 1.0f;} + if (val3 < 1.0f) {val3 = 1.0f;} + P1[index] = P1[index]/val1; + P2[index] = P2[index]/val2; + P3[index] = P3[index]/val3; + } + return; +} + + +__global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P2_old, float *P3, float *P3_old, float *R1, float *R2, float *R3, float tkp1, float tk, float multip2, int N, int M, int Z, int ImSize) +{ + //calculate each thread global index + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if ((i < N) && (j < M) && (k < Z)) { + R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); + R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); + R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]); + } + return; +} + +__global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + if (Output[index] < 0.0f) Output[index] = 0.0f; + } +} + +__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ + +////////////MAIN HOST FUNCTION /////////////// +extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ) +{ + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return; + } + + int count = 0, i; + float re, multip,multip2; + float tk = 1.0f; + float tkp1=1.0f; + + if (dimZ <= 1) { + /*2D verson*/ + int ImSize = dimX*dimY; + float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); + dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P1_prev, 0, ImSize*sizeof(float)); + cudaMemset(P2_prev, 0, ImSize*sizeof(float)); + cudaMemset(R1, 0, ImSize*sizeof(float)); + cudaMemset(R2, 0, ImSize*sizeof(float)); + + /********************** Run CUDA 2D kernel 
here ********************/ + multip = (1.0f/(8.0f*lambda)); + + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the gradient of the objective function */ + Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambda); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /*Taking a step towards minus of the gradient*/ + Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* projection step */ + if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ + else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + multip2 = ((tk-1.0f)/tkp1); + + Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (epsil != 0.0f) { + /* calculate norm - stopping rules using the Thrust library */ + ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); + float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); + + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 4) break; + + copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + + copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); + /***************************************************************/ + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + if (epsil != 0.0f) cudaFree(d_update_prev); + cudaFree(P1); + cudaFree(P2); + cudaFree(P1_prev); + cudaFree(P2_prev); + cudaFree(R1); + cudaFree(R2); + } + else { + /*3D verson*/ + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + + /*allocate space for images on device*/ + checkCudaErrors( 
cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P1_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&P3_prev,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(P1, 0, ImSize*sizeof(float)); + cudaMemset(P2, 0, ImSize*sizeof(float)); + cudaMemset(P3, 0, ImSize*sizeof(float)); + cudaMemset(P1_prev, 0, ImSize*sizeof(float)); + cudaMemset(P2_prev, 0, ImSize*sizeof(float)); + cudaMemset(P3_prev, 0, ImSize*sizeof(float)); + cudaMemset(R1, 0, ImSize*sizeof(float)); + cudaMemset(R2, 0, ImSize*sizeof(float)); + cudaMemset(R3, 0, ImSize*sizeof(float)); + /********************** Run CUDA 3D kernel here ********************/ + multip = (1.0f/(8.0f*lambda)); + + /* The main kernel */ + for (i = 0; i < iter; i++) { + + /* computing the gradient of the objective function */ + Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambda); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + if (nonneg != 0) { + nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } + + /*Taking a step towards minus of the gradient*/ + Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + /* projection step */ + if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ + else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; + multip2 = ((tk-1.0f)/tkp1); + + Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + tk = tkp1; + } + if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i); + /***************************************************************/ + //copy result matrix from device to host memory + 
cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(P1); + cudaFree(P2); + cudaFree(P3); + cudaFree(P1_prev); + cudaFree(P2_prev); + cudaFree(P3_prev); + cudaFree(R1); + cudaFree(R2); + cudaFree(R3); + } + cudaDeviceReset(); +} diff --git a/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h new file mode 100755 index 0000000..a5d3f73 --- /dev/null +++ b/Core/regularizers_GPU/TV_FGP/FGP_TV_GPU_core.h @@ -0,0 +1,10 @@ +#include <stdio.h> +#include <stdlib.h> +#include <memory.h> + +#ifndef _FGP_TV_GPU_ +#define _FGP_TV_GPU_ + +extern "C" void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ); + +#endif |
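
And a matching host-side sketch for the new GPU entry point defined in FGP_TV_GPU_core.cu. As above, everything except the FGP_TV_GPU signature is an assumption: the prototype is repeated here so the driver can be built as plain C and linked against the nvcc-compiled object (the shipped FGP_TV_GPU_core.h wraps it in an unguarded extern "C", so that header is intended for C++/nvcc translation units). The host function copies the input to the device, runs the iterations, and copies the result back, so the caller only passes host pointers.

/* Hypothetical driver for the GPU path; sizes, parameters and data are illustrative. */
#include <stdio.h>
#include <stdlib.h>

/* same signature as declared in FGP_TV_GPU_core.h; the definition has C linkage */
void FGP_TV_GPU(float *Input, float *Output, float lambda, int iter, float epsil,
                int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);

int main(void)
{
    int dimX = 128, dimY = 128, dimZ = 64;          /* dimZ > 1 selects the 3D branch */
    size_t i, N = (size_t)dimX * dimY * dimZ;
    float *vol_in  = (float *) malloc(N * sizeof(float));
    float *vol_out = (float *) malloc(N * sizeof(float));
    if (vol_in == NULL || vol_out == NULL) return EXIT_FAILURE;

    for (i = 0; i < N; i++) vol_in[i] = (float)(i % 29) / 28.0f;   /* stand-in volume */

    /* methodTV = 1 selects anisotropic (l1) TV, nonneg = 1 clamps negatives,
       printM = 1 reports the stopping iteration; the 3D GPU branch in this diff
       runs a fixed number of iterations and does not use epsil */
    FGP_TV_GPU(vol_in, vol_out, 0.01f, 50, 0.0f, 1, 1, 1, dimX, dimY, dimZ);

    printf("first denoised voxel: %f\n", vol_out[0]);
    free(vol_in); free(vol_out);
    return EXIT_SUCCESS;
}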