diff options
| author | Suren A. Chilingaryan <csa@suren.me> | 2020-03-29 23:13:45 +0200 | 
|---|---|---|
| committer | Suren A. Chilingaryan <csa@suren.me> | 2020-03-29 23:13:45 +0200 | 
| commit | cd0dd76a48aada43c3066642a8eb327a0374d113 (patch) | |
| tree | d5ce8cebda9511c5ac6280df54c29e8b631fa1cb /src | |
| parent | febfe9a6490052d4b8789fd8f7a0342115bfd55e (diff) | |
| download | regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.gz regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.bz2 regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.tar.xz regularization-cd0dd76a48aada43c3066642a8eb327a0374d113.zip | |
Add optimization steps for reference
Diffstat (limited to 'src')
| -rw-r--r-- | src/Core/performance_CPU/README | 25 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v15 | 731 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v17 | 690 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v18 | 688 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v19 | 681 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v27 | 650 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v32 | 676 | ||||
| -rwxr-xr-x | src/Core/performance_CPU/TNV_core.c.v4.stdver | 629 | ||||
| -rw-r--r-- | src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 | 100 | ||||
| -rw-r--r-- | src/Core/performance_CPU/TNV_core_loop.h.v32 | 119 | 
10 files changed, 4989 insertions, 0 deletions
| diff --git a/src/Core/performance_CPU/README b/src/Core/performance_CPU/README new file mode 100644 index 0000000..948c544 --- /dev/null +++ b/src/Core/performance_CPU/README @@ -0,0 +1,25 @@ +TNV_core.c.v4.stdver: Fully equivalent (signle-threads). There is two potential breakage points.  +    - If OpenMP is enabled, the acces to div_upd will not be serialized and results will breaj +    - The results will slightly differ due to different order of summation if loop summing resprimal/resdual organized in a logical way +TNV_core.c.v15: Multi-threads.  Works correctly only in the single-threaded mode (if TNV_NEW_STYLE is disabled). In multi-threaded there results slightly differ due to changed order of operation +    - TNV_NEW_STYLE slightly disturbs results in both single- and multi-threaded modes +    - Resprimal/resdual are summed in groups (not sequentially) if multiple threads. But his actually should improve precision. Use TNV_CHECK_RES to check conformance +    - Afterwards, in multi-threaded moded there is a still minor descripancy which first occurs in resprimal (after a few iterations). This is because of changed order of operations while computing +    div_upd (only on the first lines of each new sub-block). Normally, we first compute the vertical and, then, add horizontal. On the border rows, instead we first add horizontals... +    To check, div/div_upd changed to double. There is no difference then. +TNV_core.c.v17: Computationaly comptabile with v15. +    - Padding actually harms performance +    - Intel compiler gives about 10% speed-up +TNV_core.c.v18: Blocking helps to boost performance further but only with Intel Compiler. Gcc/Clang is slightly slower here. +    - Padding here doesn't harm performance, but is not helpful either +    - Difference between icc and gcc is probably due to auto-vectorization. +    - Results slightly changed due to different order of operations +TNV_core.c.v19: Eliminate conditionals in the inner loops to help gcc-autovectorisation +    - Last version implementing full algrotithm with backtrack in the middle of iterations. +    - Again results slightly diverge from v18 due to different order of operations +TNV_core.c.v27: v18 with backtracking only on first iterations (otherwise warning reported) +TNV_core.c.v32: v19 with backtracking only on first iterations (otherwise warning reported) + + +Repo: +    - Padding seems to have effect on the newer AVX2 systems. Re-enabled. diff --git a/src/Core/performance_CPU/TNV_core.c.v15 b/src/Core/performance_CPU/TNV_core.c.v15 new file mode 100755 index 0000000..3161cbf --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v15 @@ -0,0 +1,731 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +/*inline*/ void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, realY, copY; +    float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; +    double *div, *div_upd; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->u_upd); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->qx_upd); +    free(ctx->qy_upd); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->gradx_upd); +    free(ctx->grady_upd); +    free(ctx->div); +    free(ctx->div_upd); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int realY = ctx->realY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*realY*dimZ); +    long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + +    // Auxiliar vectors +    ctx->Input = calloc(Dim1Total, sizeof(float)); +    ctx->u = calloc(Dim1Total, sizeof(float)); +    ctx->u_upd = calloc(Dim1Total, sizeof(float)); +    ctx->qx = calloc(DimTotal, sizeof(float)); +    ctx->qy = calloc(DimTotal, sizeof(float)); +    ctx->qx_upd = calloc(DimTotal, sizeof(float)); +    ctx->qy_upd = calloc(DimTotal, sizeof(float)); +    ctx->gradx = calloc(DimTotal, sizeof(float)); +    ctx->grady = calloc(DimTotal, sizeof(float)); +    ctx->gradx_upd = calloc(DimTotal, sizeof(float)); +    ctx->grady_upd = calloc(DimTotal, sizeof(float)); +    ctx->div = calloc(Dim1Total, sizeof(double)); +    ctx->div_upd = calloc(Dim1Total, sizeof(double)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int realY = ctx->realY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*realY*dimZ); +    long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(double)); +    memset(ctx->u_upd, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx_upd, 0, DimTotal * sizeof(float)); +    memset(ctx->qy_upd, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx_upd, 0, DimTotal * sizeof(float)); +    memset(ctx->grady_upd, 0, DimTotal * sizeof(float)); +    memset(ctx->div_upd, 0, Dim1Total * sizeof(double)); + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * dimZ + i * dimZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * dimZ + i * dimZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int realY = ctx->realY; +    int copY = ctx->copY; + +    long DimTotal = (long)(dimX*realY*dimZ); +    long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * dimZ + i * dimZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int realY = ctx->realY; +    long DimTotal = (long)(dimX*realY*dimZ); +    long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + +    // Auxiliar vectors +    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); +    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); +    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); +    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(double)); + +    return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int realY = ctx->realY; +    long DimTotal = (long)(dimX*realY*dimZ); +    long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + +    // Auxiliar vectors +    memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); +    memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); +    memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); +    memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); +    memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); +    memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(double)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *u_upd = ctx->u_upd; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *qx_upd = ctx->qx_upd; +    float *qy_upd = ctx->qy_upd; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *gradx_upd = ctx->gradx_upd; +    float *grady_upd = ctx->grady_upd; +    double *div = ctx->div; +    double *div_upd = ctx->div_upd; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float udiff[dimZ]; +    float qxdiff; +    float qydiff; +    float divdiff; +    float gradxdiff[dimZ]; +    float gradydiff[dimZ]; + +    for(k=0; k < dimZ * dimX; k++) {  +        u_upd[k] = (u[k] + tau * div[k] + taulambda * Input[k])/constant; +        div_upd[k] = 0; +    } + +    for(j = 0; j < stepY; j++) { +/*        m = j * dimX * dimZ + (dimX - 1) * dimZ; +        for(k = 0; k < dimZ; k++) {  +            u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;  +        }*/ +         +        for(i=0; i < (dimX /*- 1*/); i++) { +            l = (j * dimX  + i) * dimZ; +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            m = dimX * dimZ; +         +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + +                gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + dimZ] - u_upd[l + k]); +                grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + dimX * dimZ] - u_upd[l + k]);   // We need div from the next thread on last iter + +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); +//                if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { +                div_upd[l + k] += qx_upd[l + k]; +                div_upd[l + k + dimZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { +                div_upd[l + k] += qy_upd[l + k]; +                div_upd[l + k + dimX * dimZ] = -qy_upd[l + k];  // We need to update div in the next thread on last iter +} + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +                if ((offY == 0)||(j > 0)) { +                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                    resprimal += fabs(divtau * udiff[k] + divdiff);  +                } +            } +             +        } // i +    } +     +     +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +    tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; +        ctx->realY = ctx->stepY; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (i = 1; i < tnv_ctx.threads; i++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (i - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + +            l = ctx0->stepY * dimX * dimZ; +            for(k=0; k < dimZ * (dimX /*- 1*/); k++) { +                double div_upd_add = ctx0->div_upd[l + k]; +                ctx->div_upd[k] += div_upd_add; +                ctx0->div_upd[l + k] = ctx->div_upd[k]; + +//                ctx0->u_upd[l + k] = ctx->u_upd[k]; + +                float divdiff = ctx->div[k] - ctx->div_upd[k];  // Multiple steps... How we compute without history? +                float udiff = ctx->u[k] - ctx->u_upd[k]; +                resprimal += fabs(divtau * udiff + divdiff);  +            } +        } + +#define TNV_CHECK_RES +#ifndef TNV_CHECK_RES +        for (i = 0; i < tnv_ctx.threads; i++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  +#else +        resprimal = 0; +        float divsigma = 1.0f / sigma; +        for(j=0; j<dimY; j++) +        for(i=0; i<dimX; i++) +        for(int l=0; l<dimZ; l++) +        { +            int step = dimY / tnv_ctx.threads; +            int extra = dimY % tnv_ctx.threads; + +            int thr, subj = j; +            for (thr = 0; thr < tnv_ctx.threads; thr++) { +                int size = step; +                if (thr < extra) size++; +                 +                if (subj >= size) subj-= size; +                else break; +            } + +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + thr; +            +            int k = subj * dimX * dimZ + i * dimZ + l; +         +            float udiff = ctx->u[k] - ctx->u_upd[k]; +            float qxdiff = ctx->qx[k] - ctx->qx_upd[k]; +            float qydiff = ctx->qy[k] - ctx->qy_upd[k]; +            float divdiff = ctx->div[k] - ctx->div_upd[k]; +            float gradxdiff = ctx->gradx[k] - ctx->gradx_upd[k]; +            float gradydiff = ctx->grady[k] - ctx->grady_upd[k]; + +            resprimal += fabs(divtau*udiff + divdiff); +            resdual += fabs(divsigma*qxdiff - gradxdiff); +            resdual += fabs(divsigma*qydiff - gradydiff); + +            unorm += (udiff * udiff); +            qnorm += (qxdiff * qxdiff + qydiff * qydiff); +            product += (gradxdiff * qxdiff + gradydiff * qydiff); +        } +#endif + + + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +        } else { +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } +         +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v17 b/src/Core/performance_CPU/TNV_core.c.v17 new file mode 100755 index 0000000..60739cd --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v17 @@ -0,0 +1,690 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; +    float *div, *div_upd; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->u_upd); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->qx_upd); +    free(ctx->qy_upd); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->gradx_upd); +    free(ctx->grady_upd); +    free(ctx->div); +    free(ctx->div_upd); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); + +    // Auxiliar vectors +    ctx->Input = malloc(Dim1Total * sizeof(float)); +    ctx->u = malloc(Dim1Total * sizeof(float)); +    ctx->u_upd = malloc(Dim1Total * sizeof(float)); +    ctx->qx = malloc(DimTotal * sizeof(float)); +    ctx->qy = malloc(DimTotal * sizeof(float)); +    ctx->qx_upd = malloc(DimTotal * sizeof(float)); +    ctx->qy_upd = malloc(DimTotal * sizeof(float)); +    ctx->gradx = malloc(DimTotal * sizeof(float)); +    ctx->grady = malloc(DimTotal * sizeof(float)); +    ctx->gradx_upd = malloc(DimTotal * sizeof(float)); +    ctx->grady_upd = malloc(DimTotal * sizeof(float)); +    ctx->div = malloc(Dim1Total * sizeof(float)); +    ctx->div_upd = malloc(Dim1Total * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); +    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); +    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); +    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + +    return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); +    memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); +    memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); +    memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); +    memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); +    memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *u_upd = ctx->u_upd; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *qx_upd = ctx->qx_upd; +    float *qy_upd = ctx->qy_upd; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *gradx_upd = ctx->gradx_upd; +    float *grady_upd = ctx->grady_upd; +    float *div = ctx->div; +    float *div_upd = ctx->div_upd; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float udiff[dimZ]; +    float qxdiff; +    float qydiff; +    float divdiff; +    float gradxdiff[dimZ]; +    float gradydiff[dimZ]; + +    for(i=0; i < dimX; i++) { +        for(k = 0; k < dimZ; k++) {  +            int l = i * padZ + k; +            u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; +            div_upd[l] = 0; +        } +    } + +    for(j = 0; j < stepY; j++) { +/*        m = j * dimX * dimZ + (dimX - 1) * dimZ; +        for(k = 0; k < dimZ; k++) {  +            u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant;  +        }*/ +         +        for(i = 0; i < dimX/* - 1*/; i++) { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + +                gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); +                grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]);   // We need div from the next thread on last iter + +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); +//                if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma unroll 64 +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { +                div_upd[l + k] += qx_upd[l + k]; +                div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { +                div_upd[l + k] += qy_upd[l + k]; +                div_upd[l + k + m] = -qy_upd[l + k];  // We need to update div in the next thread on last iter +} + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +                if ((offY == 0)||(j > 0)) { +                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                    resprimal += fabs(divtau * udiff[k] + divdiff);  +                } +            } +             +        } // i +    } +     +     +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); +    tnv_ctx.padZ = dimZ; +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = ctx0->stepY * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float div_upd_add = ctx0->div_upd[m + l]; +                    ctx->div_upd[l] += div_upd_add; +                    ctx0->div_upd[m + l] = ctx->div_upd[l]; +                    //ctx0->u_upd[m + l] = ctx->u_upd[l]; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l];  // Multiple steps... How we compute without history? +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +        } else { +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } +         +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v18 b/src/Core/performance_CPU/TNV_core.c.v18 new file mode 100755 index 0000000..7192475 --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v18 @@ -0,0 +1,688 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/2 of memory consumption and ~4x performance + * This version is algorithmicly comptable with the original, but slight change in results  + * is expected due to different order of floating-point operations. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; +    float *div, *div_upd; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->u_upd); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->qx_upd); +    free(ctx->qy_upd); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->gradx_upd); +    free(ctx->grady_upd); +    free(ctx->div); +    free(ctx->div_upd); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); + +    // Auxiliar vectors +    ctx->Input = memalign(64, Dim1Total * sizeof(float)); +    ctx->u = memalign(64, Dim1Total * sizeof(float)); +    ctx->u_upd = memalign(64, Dim1Total * sizeof(float)); +    ctx->qx = memalign(64, DimTotal * sizeof(float)); +    ctx->qy = memalign(64, DimTotal * sizeof(float)); +    ctx->qx_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->qy_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx = memalign(64, DimTotal * sizeof(float)); +    ctx->grady = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->grady_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->div = memalign(64, Dim1Total * sizeof(float)); +    ctx->div_upd = malloc(Dim1Total * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); +    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); +    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); +    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + +    return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); +    memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); +    memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); +    memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); +    memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); +    memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *u_upd = ctx->u_upd; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *qx_upd = ctx->qx_upd; +    float *qy_upd = ctx->qy_upd; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *gradx_upd = ctx->gradx_upd; +    float *grady_upd = ctx->grady_upd; +    float *div = ctx->div; +    float *div_upd = ctx->div_upd; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0; +    float resdual = 0.0; +    float product = 0.0; +    float unorm = 0.0; +    float qnorm = 0.0; + +    float udiff[dimZ] __attribute__((aligned(64))); +    float qxdiff __attribute__((aligned(64))); +    float qydiff __attribute__((aligned(64))); +    float divdiff __attribute__((aligned(64))); +    float gradxdiff[dimZ] __attribute__((aligned(64))); +    float gradydiff[dimZ] __attribute__((aligned(64))); + +    for(int j1 = 0; j1 < stepY; j1 += BLOCK) { +     for(int i1 = 0; i1 < dimX; i1 += BLOCK) { +      for(int j2 = 0; j2 < BLOCK; j2++) { +        j = j1 + j2; +        for(int i2 = 0; i2 < BLOCK; i2++) { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + +            i = i1 + i2; +            if (i == dimX) break; +            if (j == stepY) { j2 = BLOCK; break; } +            l = (j * dimX  + i) * padZ; +         +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); + +                gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +                grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +                float div_upd_val = 0; +                div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; +                div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; +                div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; +                div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; +                div_upd[l + k] = div_upd_val; + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +                if ((offY == 0)||(j > 0)) { +                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                    resprimal += fabs(divtau * udiff[k] + divdiff);  +                } +            } +             +        } // i +       } // j +      } // i +    } // j +     +     +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +    tnv_ctx.padZ = dimZ; +//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = ctx0->stepY  * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; +                    ctx0->div_upd[m + l] = ctx->div_upd[l]; +                    ctx0->u_upd[m + l] = ctx->u_upd[l]; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l]; +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +        } else { +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } +         +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v19 b/src/Core/performance_CPU/TNV_core.c.v19 new file mode 100755 index 0000000..9b19ed5 --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v19 @@ -0,0 +1,681 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/2 of memory consumption and ~4x performance + * This version is algorithmicly comptable with the original, but slight change in results  + * is expected due to different order of floating-point operations. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; +    float *div, *div_upd; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->u_upd); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->qx_upd); +    free(ctx->qy_upd); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->gradx_upd); +    free(ctx->grady_upd); +    free(ctx->div); +    free(ctx->div_upd); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); + +    // Auxiliar vectors +    ctx->Input = memalign(64, Dim1Total * sizeof(float)); +    ctx->u = memalign(64, Dim1Total * sizeof(float)); +    ctx->u_upd = memalign(64, Dim1Total * sizeof(float)); +    ctx->qx = memalign(64, DimTotal * sizeof(float)); +    ctx->qy = memalign(64, DimTotal * sizeof(float)); +    ctx->qx_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->qy_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx = memalign(64, DimTotal * sizeof(float)); +    ctx->grady = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->grady_upd = memalign(64, DimTotal * sizeof(float)); +    ctx->div = memalign(64, Dim1Total * sizeof(float)); +    ctx->div_upd = malloc(Dim1Total * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); +    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); +    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); +    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); +    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + +    return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    // Auxiliar vectors +    memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); +    memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); +    memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); +    memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); +    memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); +    memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l; +    long i1, i2, j1, j2; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *u_upd = ctx->u_upd; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *qx_upd = ctx->qx_upd; +    float *qy_upd = ctx->qy_upd; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *gradx_upd = ctx->gradx_upd; +    float *grady_upd = ctx->grady_upd; +    float *div = ctx->div; +    float *div_upd = ctx->div_upd; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual1 = 0.0f; +    float resdual2 = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float udiff[dimZ] __attribute__((aligned(64))); +    float qxdiff __attribute__((aligned(64))); +    float qydiff __attribute__((aligned(64))); +    float divdiff __attribute__((aligned(64))); +    float gradxdiff[dimZ] __attribute__((aligned(64))); +    float gradydiff[dimZ] __attribute__((aligned(64))); + + +    j = 0; { +#       define TNV_LOOP_FIRST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_backtrack_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_FIRST_J +    } + + + +    for(int j = 1; j < (copY - 1); j++) { +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +    } + +    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { +        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { +            for(int j2 = 0; j2 < BLOCK; j2 ++) { +                j = j1 + j2; +                for(int i2 = 0; i2 < BLOCK; i2++) { +                    i = i1 + i2; +                     +                    if (i == (dimX - 1)) break; +                    if (j == (copY - 1)) { j2 = BLOCK; break; } +#           include "TNV_core_backtrack_loop.h" +                }    +            } +        } // i + +    } + +    for(int j = 1; j < (copY - 1); j++) { +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +    } + + + +    for (j = copY - 1; j < stepY; j++) { +#       define TNV_LOOP_LAST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_backtrack_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_LAST_J +    } + +     +    ctx->resprimal = resprimal; +    ctx->resdual = resdual1 + resdual2; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +    tnv_ctx.padZ = dimZ; +//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = ctx0->stepY  * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; +                    ctx0->div_upd[m + l] = ctx->div_upd[l]; +                    ctx0->u_upd[m + l] = ctx->u_upd[l]; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l]; +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l]; +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +        } else { +            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); +            if (!err) err = hw_sched_wait_task(sched); +            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } +         +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v27 b/src/Core/performance_CPU/TNV_core.c.v27 new file mode 100755 index 0000000..dce414a --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v27 @@ -0,0 +1,650 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + *  + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/3 of memory consumption and ~10x performance + * This version is not able to perform back-track except during first iterations + * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c)  + * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *qx, *qy, *gradx, *grady, *div; +    float *div0, *udiff0; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->div); +     +    free(ctx->div0); +    free(ctx->udiff0); +     +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); +    long DimRow = (long)(dimX * padZ); + +    // Auxiliar vectors +    ctx->Input = memalign(64, Dim1Total * sizeof(float)); +    ctx->u = memalign(64, Dim1Total * sizeof(float)); +    ctx->qx = memalign(64, DimTotal * sizeof(float)); +    ctx->qy = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx = memalign(64, DimTotal * sizeof(float)); +    ctx->grady = memalign(64, DimTotal * sizeof(float)); +    ctx->div = memalign(64, Dim1Total * sizeof(float)); + +    ctx->div0 = memalign(64, DimRow * sizeof(float)); +    ctx->udiff0 = memalign(64, DimRow * sizeof(float)); + +    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *div = ctx->div; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float qxdiff; +    float qydiff; +    float divdiff; +    float gradxdiff[dimZ] __attribute__((aligned(64))); +    float gradydiff[dimZ] __attribute__((aligned(64))); +    float ubarx[dimZ] __attribute__((aligned(64))); +    float ubary[dimZ] __attribute__((aligned(64))); +    float udiff[dimZ] __attribute__((aligned(64))); + + +    for(i=0; i < dimX; i++) { +        for(k = 0; k < dimZ; k++) {  +            int l = i * padZ + k; +            float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; +            float udiff = u[l] - u_upd; +            ctx->udiff0[l] = udiff; +            ctx->div0[l] = div[l]; +        } +    } + +    for(int j1 = 0; j1 < stepY; j1 += BLOCK) { +     for(int i1 = 0; i1 < dimX; i1 += BLOCK) { +      for(int j2 = 0; j2 < BLOCK; j2++) { +        j = j1 + j2; +        for(int i2 = 0; i2 < BLOCK; i2++) { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + +            i = i1 + i2; +            if (i == dimX) break; +            if (j == stepY) { j2 = BLOCK; break; } +            l = (j * dimX  + i) * padZ; + +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; +                udiff[k] = u[l + k] - u_upd; +                u[l + k] = u_upd; + +                float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); +                float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); +                gradxdiff[k] = gradx[l + k] - gradx_upd; +                gradydiff[k] = grady[l + k] - grady_upd; +                gradx[l + k] = gradx_upd; +                grady[l + k] = grady_upd; +                 +                ubarx[k] = gradx_upd - theta * gradxdiff[k]; +                ubary[k] = grady_upd - theta * gradydiff[k]; + +                float vx = ubarx[k] + divsigma * qx[l + k]; +                float vy = ubary[k] + divsigma * qy[l + k]; + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                float vx = ubarx[k] + divsigma * qx[l + k]; +                float vy = ubary[k] + divsigma * qy[l + k]; +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qxdiff = sigma * (ubarx[k] - gx_upd); +                qydiff = sigma * (ubary[k] - gy_upd); +                 +                qx[l + k] += qxdiff; +                qy[l + k] += qydiff; + +                unorm += (udiff[k] * udiff[k]); +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                float div_upd = 0; +                div_upd -= (i > 0)?qx[l + k - padZ]:0; +                div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; +                div_upd += (i < (dimX-1))?qx[l + k]:0; +                div_upd += (j < (copY-1))?qy[l + k]:0; +                divdiff = div[l + k] - div_upd;   +                div[l + k] = div_upd; + +                resprimal +=  ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0;  +                resdual += fabs(divsigma * qxdiff + gradxdiff[k]); +                resdual += fabs(divsigma * qydiff + gradydiff[k]); +                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); +            } +             +        } // i +       } // j +     } // i +    } // j + + +    ctx->resprimal = resprimal; +    ctx->resdual = resdual; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = dimZ; +//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); +    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    int started = 0; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = (ctx0->stepY - 1) * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float divdiff = ctx->div0[l] - ctx->div[l]; +                    float udiff = ctx->udiff0[l]; + +                    ctx->div[l] -= ctx0->qy[l + m]; +                    ctx0->div[m + l + dimX * padZ] = ctx->div[l]; +                    ctx0->u[m + l + dimX * padZ] = ctx->u[l]; + +                    divdiff += ctx0->qy[l + m]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +             +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            if (started) { +                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); +            } else { +                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +                if (!err) err = hw_sched_wait_task(sched); +                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +            } +        } else { +            started = 1; +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +//    exit(-1); +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v32 b/src/Core/performance_CPU/TNV_core.c.v32 new file mode 100755 index 0000000..fcb2f87 --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v32 @@ -0,0 +1,676 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrtf(eig1); +    sig2 = sqrtf(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { +    int offY, stepY, copY; +    float *Input, *u, *qx, *qy, *gradx, *grady, *div; +    float *div0, *udiff0; +    float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; +    float resprimal, resdual; +    float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { +    int threads; +    tnv_thread_t *thr_ctx; +    float *InputT, *uT; +    int dimX, dimY, dimZ, padZ; +    float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    free(ctx->Input); +    free(ctx->u); +    free(ctx->qx); +    free(ctx->qy); +    free(ctx->gradx); +    free(ctx->grady); +    free(ctx->div); +     +    free(ctx->div0); +    free(ctx->udiff0); + +    free(ctx->gradxdiff);  +    free(ctx->gradydiff); +    free(ctx->ubarx); +    free(ctx->ubary); +    free(ctx->udiff); + +    return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +     +//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*(stepY+1)*padZ); +    long DimRow = (long)(dimX * padZ); +    long DimCell = (long)(padZ); + +    // Auxiliar vectors +    ctx->Input = memalign(64, Dim1Total * sizeof(float)); +    ctx->u = memalign(64, Dim1Total * sizeof(float)); +    ctx->qx = memalign(64, DimTotal * sizeof(float)); +    ctx->qy = memalign(64, DimTotal * sizeof(float)); +    ctx->gradx = memalign(64, DimTotal * sizeof(float)); +    ctx->grady = memalign(64, DimTotal * sizeof(float)); +    ctx->div = memalign(64, Dim1Total * sizeof(float)); + +    ctx->div0 = memalign(64, DimRow * sizeof(float)); +    ctx->udiff0 = memalign(64, DimRow * sizeof(float)); + +    ctx->gradxdiff = memalign(64, DimCell * sizeof(float)); +    ctx->gradydiff = memalign(64, DimCell * sizeof(float)); +    ctx->ubarx = memalign(64, DimCell * sizeof(float)); +    ctx->ubary = memalign(64, DimCell * sizeof(float)); +    ctx->udiff = memalign(64, DimCell * sizeof(float)); +     +    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { +        fprintf(stderr, "Error allocating memory\n"); +        exit(-1); +    } + +    return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +     +//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); +     +    for(k=0; k<dimZ; k++) { +        for(j=0; j<copY; j++) { +            for(i=0; i<dimX; i++) { +                ctx->Input[j * dimX * padZ + i * padZ + k] =  tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; +                ctx->u[j * dimX * padZ + i * padZ + k] =  tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; +            } +        } +    } + +    return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<stepY; j++) { +            for(i=0; i<dimX; i++) { +                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; +            } +        } +    } +     +    return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { +    int i,j,k; +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int stepY = ctx->stepY; +    int copY = ctx->copY; +    int padZ = tnv_ctx->padZ; +    long DimTotal = (long)(dimX*stepY*padZ); +    long Dim1Total = (long)(dimX*copY*padZ); + +    memset(ctx->u, 0, Dim1Total * sizeof(float)); +    memset(ctx->qx, 0, DimTotal * sizeof(float)); +    memset(ctx->qy, 0, DimTotal * sizeof(float)); +    memset(ctx->gradx, 0, DimTotal * sizeof(float)); +    memset(ctx->grady, 0, DimTotal * sizeof(float)); +    memset(ctx->div, 0, Dim1Total * sizeof(float)); + +    return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { +    long i, j, k, l, m; + +    tnv_context_t *tnv_ctx = (tnv_context_t*)data; +    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; +     +    int dimX = tnv_ctx->dimX; +    int dimY = tnv_ctx->dimY; +    int dimZ = tnv_ctx->dimZ; +    int padZ = tnv_ctx->padZ; +    int offY = ctx->offY; +    int stepY = ctx->stepY; +    int copY = ctx->copY; + +    float *Input = ctx->Input; +    float *u = ctx->u; +    float *qx = ctx->qx; +    float *qy = ctx->qy; +    float *gradx = ctx->gradx; +    float *grady = ctx->grady; +    float *div = ctx->div; + +    long p = 1l; +    long q = 1l; +    long r = 0l; + +    float lambda = tnv_ctx->lambda; +    float sigma = tnv_ctx->sigma; +    float tau = tnv_ctx->tau; +    float theta = tnv_ctx->theta; +     +    float taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; +    float constant = 1.0f + taulambda; + +    float resprimal = 0.0f; +    float resdual1 = 0.0f; +    float resdual2 = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f; + +    float qxdiff; +    float qydiff; +    float divdiff; +    float *gradxdiff = ctx->gradxdiff; +    float *gradydiff = ctx->gradydiff; +    float *ubarx = ctx->ubarx; +    float *ubary = ctx->ubary; +    float *udiff = ctx->udiff; + +    float *udiff0 = ctx->udiff0; +    float *div0 = ctx->div0; + +/* +    for(i=0; i < dimX; i++) { +        for(k = 0; k < dimZ; k++) {  +            int l = i * padZ + k; +            float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; +            float udiff_val = u[l] - u_upd; +            udiff0[l] = udiff_val; +            div0[l] = div[l]; +        } +    } +*/ + + +    j = 0; { +#       define TNV_LOOP_FIRST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_FIRST_J +    } + + + +    for(int j = 1; j < (copY - 1); j++) { +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +    } + +#define BLOCK 32 +    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { +        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { +            for(int j2 = 0; j2 < BLOCK; j2 ++) { +                j = j1 + j2; +                for(int i2 = 0; i2 < BLOCK; i2++) { +                    i = i1 + i2; +                     +                    if (i == (dimX - 1)) break; +                    if (j == (copY - 1)) { j2 = BLOCK; break; } +#           include "TNV_core_loop.h" +                }    +            } +        } // i + +    } + +    for(int j = 1; j < (copY - 1); j++) { +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +    } + + + +    for (j = copY - 1; j < stepY; j++) { +#       define TNV_LOOP_LAST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_LAST_J +    } + + + +    ctx->resprimal = resprimal; +    ctx->resdual = resdual1 + resdual2; +    ctx->product = product; +    ctx->unorm = unorm; +    ctx->qnorm = qnorm; + +    return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { +    int i, off, size, err; + +    if (sched) return; + +    tnv_ctx.dimX = dimX; +    tnv_ctx.dimY = dimY; +    tnv_ctx.dimZ = dimZ; +        // Padding seems actually slower +//    tnv_ctx.padZ = dimZ; +//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); +    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +     +    hw_sched_init(); + +    int threads = hw_sched_get_cpu_count(); +    if (threads > dimY) threads = dimY/2; + +    int step = dimY / threads; +    int extra = dimY % threads; + +    tnv_ctx.threads = threads; +    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); +    for (i = 0, off = 0; i < threads; i++, off += size) { +        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; +        size = step + ((i < extra)?1:0); + +        ctx->offY = off; +        ctx->stepY = size; + +        if (i == (threads-1)) ctx->copY = ctx->stepY; +        else ctx->copY = ctx->stepY + 1; +    } + +    sched = hw_sched_create(threads); +    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    int err; +    int iter; +    int i,j,k,l,m; + +    lambda = 1.0f/(2.0f*lambda); +    tnv_ctx.lambda = lambda; + +    // PDHG algorithm parameters +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + +    tnv_ctx.InputT = InputT; +    tnv_ctx.uT = uT; +     +    int padZ = tnv_ctx.padZ; + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + +    // Apply Primal-Dual Hybrid Gradient scheme +    float residual = fLarge; +    int started = 0; +    for(iter = 0; iter < maxIter; iter++)   { +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +        float divtau = 1.0f / tau; + +        tnv_ctx.sigma = sigma; +        tnv_ctx.tau = tau; +        tnv_ctx.theta = theta; + +        err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); +        if (!err) err = hw_sched_wait_task(sched); +        if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + +            // border regions +        for (j = 1; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + +            m = (ctx0->stepY - 1) * dimX * padZ; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float divdiff = ctx->div0[l] - ctx->div[l]; +                    float udiff = ctx->udiff0[l]; + +                    ctx->div[l] -= ctx0->qy[l + m]; +                    ctx0->div[m + l + dimX*padZ] = ctx->div[l]; +                    ctx0->u[m + l + dimX*padZ] = ctx->u[l]; +                     +                    divdiff += ctx0->qy[l + m]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float divdiff = ctx->div0[l] - ctx->div[l]; +                    float udiff = ctx->udiff0[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } + +        for (j = 0; j < tnv_ctx.threads; j++) { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; +            resprimal += ctx->resprimal; +            resdual += ctx->resdual; +            product += ctx->product; +            unorm += ctx->unorm; +            qnorm += ctx->qnorm; +        }  + +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + +        if(b > 1) { +             +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            if (started) { +                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); +            } else { +                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); +                if (!err) err = hw_sched_wait_task(sched); +                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } +            } +        } else { +            started = 1; +            if(resprimal > dual_dot_delta) { +                    // Increase primal step-size and decrease dual step-size +                tau = tau / (1.0f - alpha); +                sigma = sigma * (1.0f - alpha); +                alpha = alpha * eta; +            } else if(resprimal < dual_div_delta) { +                    // Decrease primal step-size and increase dual step-size +                tau = tau * (1.0f - alpha); +                sigma = sigma / (1.0f - alpha); +                alpha = alpha * eta; +            } +        } + +        if (residual < tol) break; +    } + +    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); +    if (!err) err = hw_sched_wait_task(sched); +    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); +    printf("Return: %f\n", *uT); + +    return *uT; +} diff --git a/src/Core/performance_CPU/TNV_core.c.v4.stdver b/src/Core/performance_CPU/TNV_core.c.v4.stdver new file mode 100755 index 0000000..6be60ae --- /dev/null +++ b/src/Core/performance_CPU/TNV_core.c.v4.stdver @@ -0,0 +1,629 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { +    int ii, num; +    float divsigma = 1.0f / sigma; +    float sum, shrinkfactor; +    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; +    float proj[2] = {0}; + +    // Compute eigenvalues of M +    T = M1 + M3; +    D = M1 * M3 - M2 * M2; +    det = sqrt(MAX((T * T / 4.0f) - D, 0.0f)); +    eig1 = MAX((T / 2.0f) + det, 0.0f); +    eig2 = MAX((T / 2.0f) - det, 0.0f); +    sig1 = sqrt(eig1); +    sig2 = sqrt(eig2); + +    // Compute normalized eigenvectors +    V1 = V2 = V3 = V4 = 0.0f; + +    if(M2 != 0.0f) +    { +        v0 = M2; +        v1 = eig1 - M3; +        v2 = eig2 - M3; + +        mu1 = sqrtf(v0 * v0 + v1 * v1); +        mu2 = sqrtf(v0 * v0 + v2 * v2); + +        if(mu1 > fTiny) +        { +            V1 = v1 / mu1; +            V3 = v0 / mu1; +        } + +        if(mu2 > fTiny) +        { +            V2 = v2 / mu2; +            V4 = v0 / mu2; +        } + +    } else +    { +        if(M1 > M3) +        { +            V1 = V4 = 1.0f; +            V2 = V3 = 0.0f; + +        } else +        { +            V1 = V4 = 0.0f; +            V2 = V3 = 1.0f; +        } +    } + +    // Compute prox_p of the diagonal entries +    sig1_upd = sig2_upd = 0.0f; + +    if(p == 1) +    { +        sig1_upd = MAX(sig1 - divsigma, 0.0f); +        sig2_upd = MAX(sig2 - divsigma, 0.0f); + +    } else if(p == INFNORM) +    { +        proj[0] = sigma * fabs(sig1); +        proj[1] = sigma * fabs(sig2); + +        /*l1 projection part */ +        sum = fLarge; +        num = 0l; +        shrinkfactor = 0.0f; +        while(sum > 1.0f) +        { +            sum = 0.0f; +            num = 0; + +            for(ii = 0; ii < 2; ii++) +            { +                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                sum += fabs(proj[ii]); +                if(proj[ii]!= 0.0f) +                    num++; +            } + +            if(num > 0) +                shrinkfactor = (sum - 1.0f) / num; +            else +                break; +        } +        /*l1 proj ends*/ + +        sig1_upd = sig1 - divsigma * proj[0]; +        sig2_upd = sig2 - divsigma * proj[1]; +    } + +    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +    if(sig1 > fTiny) +        sig1_upd /= sig1; + +    if(sig2 > fTiny) +        sig2_upd /= sig2; + +    // Compute solution +    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on)  [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ +    long i, j, k, p, q, r, DimTotal; +    float taulambda; +    float *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd,  *div, *div_upd; + +    p = 1l; +    q = 1l; +    r = 0l; + +    lambda = 1.0f/(2.0f*lambda); +    DimTotal = (long)(dimX*dimY*dimZ); +    /* PDHG algorithm parameters*/ +    float tau = 0.5f; +    float sigma = 0.5f; +    float theta = 1.0f; + +    // Auxiliar vectors +    u_upd = calloc(DimTotal, sizeof(float)); +    qx = calloc(DimTotal, sizeof(float)); +    qy = calloc(DimTotal, sizeof(float)); +    qx_upd = calloc(DimTotal, sizeof(float)); +    qy_upd = calloc(DimTotal, sizeof(float)); +    gradx = calloc(DimTotal, sizeof(float)); +    grady = calloc(DimTotal, sizeof(float)); +    gradx_upd = calloc(DimTotal, sizeof(float)); +    grady_upd = calloc(DimTotal, sizeof(float)); +    div = calloc(DimTotal, sizeof(float)); +    div_upd = calloc(DimTotal, sizeof(float)); + + +    float *Input = calloc(DimTotal, sizeof(float)); +    float *u = calloc(DimTotal, sizeof(float)); +    for(k=0; k<dimZ; k++) { +        for(j=0; j<dimY; j++) { +            for(i=0; i<dimX; i++) { +                Input[j * dimX * dimZ + i * dimZ + k] =  InputT[k * dimX * dimY + j * dimX + i]; +                u[j * dimX * dimZ + i * dimZ + k] =  uT[k * dimX * dimY + j * dimX + i]; +            } +        } +    } + + +    // Backtracking parameters +    float s = 1.0f; +    float gamma = 0.75f; +    float beta = 0.95f; +    float alpha0 = 0.2f; +    float alpha = alpha0; +    float delta = 1.5f; +    float eta = 0.95f; + +    // PDHG algorithm parameters +    taulambda = tau * lambda; +    float divtau = 1.0f / tau; +    float divsigma = 1.0f / sigma; +    float theta1 = 1.0f + theta; + +    /*allocate memory for  taulambda */ +    //taulambda = (float*) calloc(dimZ, sizeof(float)); +    //for(k=0; k < dimZ; k++)  {taulambda[k] = tau*lambda[k];} + +    // Apply Primal-Dual Hybrid Gradient scheme +    int iter = 0; +    float residual = fLarge; + +    for(iter = 0; iter < maxIter; iter++)   { +        // Argument of proximal mapping of fidelity term +        // Proximal solution of fidelity term +        // proxG(u_upd, u, div, Input, tau, taulambda, (long)(dimX), (long)(dimY), (long)(dimZ)); +        float constant = 1.0f + taulambda; +        #pragma omp parallel for shared(Input, u, u_upd) private(k) +        for(k=0; k<dimZ*dimX*dimY; k++) { +            float v = u[k] + tau * div[k]; +            u_upd[k] = (v + taulambda * Input[k])/constant; +        } + +    memset(div_upd, 0, dimX*dimY*dimZ*sizeof(float)); +            // This changes results, I guess access to div_upd is violated +//    #pragma omp parallel for shared (gradx_upd,grady_upd,gradx,grady,qx,qy,qx_upd,qy_upd) private(i,j,k) +        for(j=0; j<dimY; j++) { +            for(i=0; i<dimX; i++) { +                float t[3]; +                float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +                int l = (j * dimX  + i) * dimZ; +                 +                for(k = 0; k < dimZ; k++) { +                    if(i != dimX-1) +                        gradx_upd[l + k] = u_upd[l + k + dimZ] - u_upd[l + k]; +                    else +                        gradx_upd[l + k] = 0.0f; + +                    if(j != dimY-1) +                        grady_upd[l + k] = u_upd[l + k + dimX * dimZ] - u_upd[l + k]; +                    else +                        grady_upd[l + k] = 0.0f; + +                    float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                    float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                    float vx = ubarx + divsigma * qx[l + k]; +                    float vy = ubary + divsigma * qy[l + k]; +         +                    M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +                } +                 +                coefF(t, M1, M2, M3, sigma, p, q, r); +                 +                for(k = 0; k < dimZ; k++) { +                    float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                    float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                    float vx = ubarx + divsigma * qx[l + k]; +                    float vy = ubary + divsigma * qy[l + k]; + +                    float gx_upd = vx * t[0] + vy * t[1]; +                    float gy_upd = vx * t[1] + vy * t[2]; + +                    qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                    qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +                     +                    if(i != dimX-1) { +                        div_upd[l + k] += qx_upd[l + k]; +                        div_upd[l + k + dimZ] -= qx_upd[l + k]; +                    } +                     +                    if(j != dimY-1) { +                        div_upd[l + k] += qy_upd[l + k]; +                        div_upd[l + k + dimX * dimZ] -= qy_upd[l + k]; +                    } +                } +            } +        } + +// Compute primal residual, dual residual, and backtracking condition +        float resprimal = 0.0f; +        float resdual = 0.0f; +        float product = 0.0f; +        float unorm = 0.0f; +        float qnorm = 0.0f; + +            // If this loop is inner, the result slightly changed due to different summation order +        for(int l=0; l<dimZ; l++) +        for(j=0; j<dimY; j++) +        for(i=0; i<dimX; i++) +        { +//        for(k=0; k<dimX*dimY*dimZ; k++) { +            int k = j * dimX * dimZ + i * dimZ + l; +         +            float udiff = u[k] - u_upd[k]; +            float qxdiff = qx[k] - qx_upd[k]; +            float qydiff = qy[k] - qy_upd[k]; +            float divdiff = div[k] - div_upd[k]; +            float gradxdiff = gradx[k] - gradx_upd[k]; +            float gradydiff = grady[k] - grady_upd[k]; + +            resprimal += fabs(divtau*udiff + divdiff); +            resdual += fabs(divsigma*qxdiff - gradxdiff); +            resdual += fabs(divsigma*qydiff - gradydiff); + +            unorm += (udiff * udiff); +            qnorm += (qxdiff * qxdiff + qydiff * qydiff); +            product += (gradxdiff * qxdiff + gradydiff * qydiff); +        } + +        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + +                  gamma * tau * qnorm); + +//        printf("resprimal: %f, resdual: %f, b: %f\n", resprimal, resdual, b); +        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + +//        printf("b: %f\n", b); + +// Adapt step-size parameters +        float dual_dot_delta = resdual * s * delta; +        float dual_div_delta = (resdual * s) / delta; + +        if(b > 1) +        { +            // Decrease step-sizes to fit balancing principle +            tau = (beta * tau) / b; +            sigma = (beta * sigma) / b; +            alpha = alpha0; + +            copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); +            copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); +            copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); +            copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); +            copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); +            copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); + +        } else if(resprimal > dual_dot_delta) +        { +            // Increase primal step-size and decrease dual step-size +            tau = tau / (1.0f - alpha); +            sigma = sigma * (1.0f - alpha); +            alpha = alpha * eta; + +        } else if(resprimal < dual_div_delta) +        { +            // Decrease primal step-size and increase dual step-size +            tau = tau * (1.0f - alpha); +            sigma = sigma / (1.0f - alpha); +            alpha = alpha * eta; +        } + +// Update variables +        taulambda = tau * lambda; +//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k]; + +        divsigma = 1.0f / sigma; +        divtau = 1.0f / tau; + +        copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ)); +        copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ)); +        copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ)); +        copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ)); +        copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ)); +        copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ)); + +// Compute residual at current iteration +        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + +//       printf("%f \n", residual); +        if (residual < tol) { +            printf("Iterations stopped at %i with the residual %f \n", iter, residual); +            break; +        } + +    } +    printf("Iterations stopped at %i with the residual %f \n", iter, residual); + +    free (u_upd); +    free(qx); +    free(qy); +    free(qx_upd); +    free(qy_upd); +    free(gradx); +    free(grady); +    free(gradx_upd); +    free(grady_upd); +    free(div); +    free(div_upd); +    printf("Return: %f\n", *u); + +    for(k=0; k<dimZ; k++) { +        for(j=0; j<dimY; j++) { +            for(i=0; i<dimX; i++) { +                uT[k * dimX * dimY + j * dimX + i] = u[j * dimX * dimZ + i * dimZ + k]; +            } +        } +    } + + + + +    return *u; +} + +float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ) +{ +    float constant; +    long k; +    constant = 1.0f + taulambda; +    #pragma omp parallel for shared(v, f, u_upd) private(k) +    for(k=0; k<dimZ*dimX*dimY; k++) { +        u_upd[k] = (v[k] + taulambda * f[k])/constant; +        //u_upd[(dimX*dimY)*k + l] = (v[(dimX*dimY)*k + l] + taulambda * f[(dimX*dimY)*k + l])/constant; +    } +    return *u_upd; +} + +float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ) +{ +    long i, j, k, l; +    // Compute discrete gradient using forward differences +    #pragma omp parallel for shared(gradx_upd,grady_upd,u_upd) private(i, j, k, l) +    for(k = 0; k < dimZ; k++)   { +        for(j = 0; j < dimY; j++)   { +            l = j * dimX; +            for(i = 0; i < dimX; i++)   { +                // Derivatives in the x-direction +                if(i != dimX-1) +                    gradx_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+1+l] - u_upd[(dimX*dimY)*k + i+l]; +                else +                    gradx_upd[(dimX*dimY)*k + i+l] = 0.0f; + +                // Derivatives in the y-direction +                if(j != dimY-1) +                    //grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+dimY+l] -u_upd[(dimX*dimY)*k + i+l]; +                    grady_upd[(dimX*dimY)*k + i+l] = u_upd[(dimX*dimY)*k + i+(j+1)*dimX] -u_upd[(dimX*dimY)*k + i+l]; +                else +                    grady_upd[(dimX*dimY)*k + i+l] = 0.0f; +            } +        } +    } +    return 1; +} + +float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ) +{ +    // (S^p, \ell^1) norm decouples at each pixel +//   Spl1(gx, gy, vx, vy, sigma, p, num_channels, dim); +    float divsigma = 1.0f / sigma; + +    // $\ell^{1,1,1}$-TV regularization +//       int i,j,k; +//     #pragma omp parallel for shared (gx,gy,vx,vy) private(i,j,k) +//      for(k = 0; k < dimZ; k++)  { +//         for(i=0; i<dimX; i++) { +//              for(j=0; j<dimY; j++) { +//                 gx[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vx[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vx[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma,  0.0f); +//                 gy[(dimX*dimY)*k + (i)*dimY + (j)] = SIGN(vy[(dimX*dimY)*k + (i)*dimY + (j)]) * MAX(fabs(vy[(dimX*dimY)*k + (i)*dimY + (j)]) - divsigma,  0.0f); +//             }}} + +    // Auxiliar vector +    float *proj, sum, shrinkfactor ; +    float M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3; +    long i,j,k, ii, num; +    #pragma omp parallel for shared (gx,gy,vx,vy,p) private(i,ii,j,k,proj,num, sum, shrinkfactor, M1,M2,M3,valuex,valuey,T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4,v0,v1,v2,mu1,mu2,sig1_upd,sig2_upd,t1,t2,t3) +    for(i=0; i<dimX; i++) { +        for(j=0; j<dimY; j++) { + +            proj = (float*) calloc (2,sizeof(float)); +            // Compute matrix $M\in\R^{2\times 2}$ +            M1 = 0.0f; +            M2 = 0.0f; +            M3 = 0.0f; + +            for(k = 0; k < dimZ; k++) +            { +                valuex = vx[(dimX*dimY)*k + (j)*dimX + (i)]; +                valuey = vy[(dimX*dimY)*k + (j)*dimX + (i)]; + +                M1 += (valuex * valuex); +                M2 += (valuex * valuey); +                M3 += (valuey * valuey); +            } + +            // Compute eigenvalues of M +            T = M1 + M3; +            D = M1 * M3 - M2 * M2; +            det = sqrt(MAX((T * T / 4.0f) - D, 0.0f)); +            eig1 = MAX((T / 2.0f) + det, 0.0f); +            eig2 = MAX((T / 2.0f) - det, 0.0f); +            sig1 = sqrt(eig1); +            sig2 = sqrt(eig2); + +            // Compute normalized eigenvectors +            V1 = V2 = V3 = V4 = 0.0f; + +            if(M2 != 0.0f) +            { +                v0 = M2; +                v1 = eig1 - M3; +                v2 = eig2 - M3; + +                mu1 = sqrtf(v0 * v0 + v1 * v1); +                mu2 = sqrtf(v0 * v0 + v2 * v2); + +                if(mu1 > fTiny) +                { +                    V1 = v1 / mu1; +                    V3 = v0 / mu1; +                } + +                if(mu2 > fTiny) +                { +                    V2 = v2 / mu2; +                    V4 = v0 / mu2; +                } + +            } else +            { +                if(M1 > M3) +                { +                    V1 = V4 = 1.0f; +                    V2 = V3 = 0.0f; + +                } else +                { +                    V1 = V4 = 0.0f; +                    V2 = V3 = 1.0f; +                } +            } + +            // Compute prox_p of the diagonal entries +            sig1_upd = sig2_upd = 0.0f; + +            if(p == 1) +            { +                sig1_upd = MAX(sig1 - divsigma, 0.0f); +                sig2_upd = MAX(sig2 - divsigma, 0.0f); + +            } else if(p == INFNORM) +            { +                proj[0] = sigma * fabs(sig1); +                proj[1] = sigma * fabs(sig2); + +                /*l1 projection part */ +                sum = fLarge; +                num = 0l; +                shrinkfactor = 0.0f; +                while(sum > 1.0f) +                { +                    sum = 0.0f; +                    num = 0; + +                    for(ii = 0; ii < 2; ii++) +                    { +                        proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + +                        sum += fabs(proj[ii]); +                        if(proj[ii]!= 0.0f) +                            num++; +                    } + +                    if(num > 0) +                        shrinkfactor = (sum - 1.0f) / num; +                    else +                        break; +                } +                /*l1 proj ends*/ + +                sig1_upd = sig1 - divsigma * proj[0]; +                sig2_upd = sig2 - divsigma * proj[1]; +            } + +            // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ +            if(sig1 > fTiny) +                sig1_upd /= sig1; + +            if(sig2 > fTiny) +                sig2_upd /= sig2; + +            // Compute solution +            t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; +            t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; +            t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; + +            for(k = 0; k < dimZ; k++) +            { +                gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2; +                gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3; +            } + +            // Delete allocated memory +            free(proj); +        } +    } + +    return 1; +} + +float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ) +{ +    long i, j, k, l; +    #pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l) +    for(k = 0; k < dimZ; k++)   { +        for(j = 0; j < dimY; j++)   { +            l = j * dimX; +            for(i = 0; i < dimX; i++)   { +                if(i != dimX-1) +                { +                    // ux[k][i+l] = u[k][i+1+l] - u[k][i+l] +                    div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l]; +                    div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l]; +                } + +                if(j != dimY-1) +                { +                    // uy[k][i+l] = u[k][i+width+l] - u[k][i+l] +                    //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l]; +                    div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l]; +                    div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l]; +                } +            } +        } +    } +    return *div_upd; +} diff --git a/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 new file mode 100644 index 0000000..3ec4250 --- /dev/null +++ b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 @@ -0,0 +1,100 @@ +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + +            l = (j * dimX  + i) * padZ; +         +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); + +#ifdef TNV_LOOP_LAST_I +                gradx_upd[l + k] = 0; +#else +                gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +#endif + +#ifdef TNV_LOOP_LAST_J +                grady_upd[l + k] = 0; +#else +                grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +#endif + +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +                float div_upd_val = 0; +#ifndef TNV_LOOP_FIRST_I +                div_upd_val -= qx_upd[l + k - padZ]; +#endif + +#ifndef TNV_LOOP_FIRST_J +                div_upd_val -= qy_upd[l + k - dimX * padZ]; +#endif +#ifndef TNV_LOOP_LAST_I +                div_upd_val += qx_upd[l + k]; +#endif +#ifndef TNV_LOOP_LAST_J +                div_upd_val += qy_upd[l + k]; +#endif +                div_upd[l + k] = div_upd_val; + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual2 += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +#ifndef TNV_LOOP_FIRST_J +                divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                resprimal += fabs(divtau * udiff[k] + divdiff);  +#endif +            } diff --git a/src/Core/performance_CPU/TNV_core_loop.h.v32 b/src/Core/performance_CPU/TNV_core_loop.h.v32 new file mode 100644 index 0000000..be53156 --- /dev/null +++ b/src/Core/performance_CPU/TNV_core_loop.h.v32 @@ -0,0 +1,119 @@ +    { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +            float *__restrict u_next = u + l + padZ; +            float *__restrict u_current = u + l; +            float *__restrict u_next_row = u + l + m; + + +            float *__restrict qx_current = qx + l; +            float *__restrict qy_current = qy + l; +            float *__restrict qx_prev = qx + l - padZ; +            float *__restrict qy_prev = qy + l - m; + + +  __assume(padZ%16==0); +/* +  __assume_aligned(Input, 64); +  __assume_aligned(div, 64); +  __assume_aligned(gradx, 64); +  __assume_aligned(grady, 64); +  __assume_aligned(u, 64); +  __assume_aligned(qx, 64); +  __assume_aligned(qy, 64); +  __assume_aligned(u_current, 64); +  __assume_aligned(u_next, 64); +  __assume_aligned(u_next_row, 64); +*/ + +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads +                udiff[k] = u[l + k] - u_upd; // cache 1w +                u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J +                udiff0[l + k] = udiff[k]; +                div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I +                float gradx_upd = 0; +#else +                float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads +                float gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J +                float grady_upd = 0; +#else +                float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads +                float grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + +                gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w +                gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w +                gradx[l + k] = gradx_upd; // 1 write +                grady[l + k] = grady_upd; // 1 write +                 +                ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w +                ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + +                float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read +                float vy = ubary[k] + divsigma * qy[l + k]; // 1 read + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < padZ; k++) { +                float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r +                float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qxdiff = sigma * (ubarx[k] - gx_upd); +                qydiff = sigma * (ubary[k] - gy_upd); +                 +                qx_current[k] += qxdiff; // write 1 +                qy_current[k] += qydiff; // write 1 + +                unorm += (udiff[k] * udiff[k]); +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                float div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I +                div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J +                div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I +                div_upd += qx_current[k];  +#endif +#ifndef TNV_LOOP_LAST_J +                div_upd += qy_current[k];  +#endif + +                divdiff = div[l + k] - div_upd;  // 1 read +                div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J +                resprimal += fabs(divtau * udiff[k] + divdiff);  +#endif + +                    // We need to have two independent accumulators to allow gcc-autovectorization +                resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r +                resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r +                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); +            } +    } +    
\ No newline at end of file | 
