diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Core/CMakeLists.txt | 13 | ||||
| -rwxr-xr-x | src/Core/regularisers_CPU/TNV_core.c | 216 | ||||
| -rwxr-xr-x | src/Core/regularisers_CPU/TNV_core_backtrack.c | 189 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/TNV_core_backtrack_loop.h | 100 | ||||
| -rw-r--r-- | src/Core/regularisers_CPU/TNV_core_loop.h | 107 | 
5 files changed, 423 insertions, 202 deletions
| diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 10b16f3..76b0f3e 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")  message("Adding regularisers as a shared library")  #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") -set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")  #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")  #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index dce414a..415c644 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,12 +5,6 @@   *   * Copyright 2017 Daniil Kazantsev   * Copyright 2017 Srikanth Nagella, Edoardo Pasca - *  - * Copyriht 2020 Suren A. Chlingaryan - * Optimized version with 1/3 of memory consumption and ~10x performance - * This version is not able to perform back-track except during first iterations - * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c)  - * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain   *   * Licensed under the Apache License, Version 2.0 (the "License");   * you may not use this file except in compliance with the License. @@ -23,6 +17,7 @@   * limitations under the License.   */ +#include <malloc.h>  #include "TNV_core.h"  #define BLOCK 32 @@ -143,6 +138,7 @@ typedef struct {      int offY, stepY, copY;      float *Input, *u, *qx, *qy, *gradx, *grady, *div;      float *div0, *udiff0; +    float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff;      float resprimal, resdual;      float unorm, qnorm, product;  } tnv_thread_t; @@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {      free(ctx->div0);      free(ctx->udiff0); -     + +    free(ctx->gradxdiff);  +    free(ctx->gradydiff); +    free(ctx->ubarx); +    free(ctx->ubary); +    free(ctx->udiff); +      return 0;  } @@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {      long DimTotal = (long)(dimX*stepY*padZ);      long Dim1Total = (long)(dimX*(stepY+1)*padZ);      long DimRow = (long)(dimX * padZ); +    long DimCell = (long)(padZ);      // Auxiliar vectors      ctx->Input = memalign(64, Dim1Total * sizeof(float)); @@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {      ctx->div0 = memalign(64, DimRow * sizeof(float));      ctx->udiff0 = memalign(64, DimRow * sizeof(float)); -    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { +    ctx->gradxdiff = memalign(64, DimCell * sizeof(float)); +    ctx->gradydiff = memalign(64, DimCell * sizeof(float)); +    ctx->ubarx = memalign(64, DimCell * sizeof(float)); +    ctx->ubary = memalign(64, DimCell * sizeof(float)); +    ctx->udiff = memalign(64, DimCell * sizeof(float)); +     +    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {          fprintf(stderr, "Error allocating memory\n");          exit(-1);      } @@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {  static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { -    long i, j, k, l; +    long i, j, k, l, m;      tnv_context_t *tnv_ctx = (tnv_context_t*)data;      tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {      float constant = 1.0f + taulambda;      float resprimal = 0.0f; -    float resdual = 0.0f; +    float resdual1 = 0.0f; +    float resdual2 = 0.0f;      float product = 0.0f;      float unorm = 0.0f;      float qnorm = 0.0f; @@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {      float qxdiff;      float qydiff;      float divdiff; -    float gradxdiff[dimZ] __attribute__((aligned(64))); -    float gradydiff[dimZ] __attribute__((aligned(64))); -    float ubarx[dimZ] __attribute__((aligned(64))); -    float ubary[dimZ] __attribute__((aligned(64))); -    float udiff[dimZ] __attribute__((aligned(64))); - - -    for(i=0; i < dimX; i++) { -        for(k = 0; k < dimZ; k++) {  -            int l = i * padZ + k; -            float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; -            float udiff = u[l] - u_upd; -            ctx->udiff0[l] = udiff; -            ctx->div0[l] = div[l]; +    float *gradxdiff = ctx->gradxdiff; +    float *gradydiff = ctx->gradydiff; +    float *ubarx = ctx->ubarx; +    float *ubary = ctx->ubary; +    float *udiff = ctx->udiff; + +    float *udiff0 = ctx->udiff0; +    float *div0 = ctx->div0; + + +    j = 0; { +#       define TNV_LOOP_FIRST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h"          } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_FIRST_J      } -    for(int j1 = 0; j1 < stepY; j1 += BLOCK) { -     for(int i1 = 0; i1 < dimX; i1 += BLOCK) { -      for(int j2 = 0; j2 < BLOCK; j2++) { -        j = j1 + j2; -        for(int i2 = 0; i2 < BLOCK; i2++) { -            float t[3]; -            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - -            i = i1 + i2; -            if (i == dimX) break; -            if (j == stepY) { j2 = BLOCK; break; } -            l = (j * dimX  + i) * padZ; - -#pragma vector aligned -#pragma GCC ivdep  -            for(k = 0; k < dimZ; k++) { -                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; -                udiff[k] = u[l + k] - u_upd; -                u[l + k] = u_upd; - -                float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); -                float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); -                gradxdiff[k] = gradx[l + k] - gradx_upd; -                gradydiff[k] = grady[l + k] - grady_upd; -                gradx[l + k] = gradx_upd; -                grady[l + k] = grady_upd; -                 -                ubarx[k] = gradx_upd - theta * gradxdiff[k]; -                ubary[k] = grady_upd - theta * gradydiff[k]; - -                float vx = ubarx[k] + divsigma * qx[l + k]; -                float vy = ubary[k] + divsigma * qy[l + k]; - -                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); -            } -            coefF(t, M1, M2, M3, sigma, p, q, r); -             -#pragma vector aligned -#pragma GCC ivdep  -            for(k = 0; k < dimZ; k++) { -                float vx = ubarx[k] + divsigma * qx[l + k]; -                float vy = ubary[k] + divsigma * qy[l + k]; -                float gx_upd = vx * t[0] + vy * t[1]; -                float gy_upd = vx * t[1] + vy * t[2]; - -                qxdiff = sigma * (ubarx[k] - gx_upd); -                qydiff = sigma * (ubary[k] - gy_upd); -                 -                qx[l + k] += qxdiff; -                qy[l + k] += qydiff; - -                unorm += (udiff[k] * udiff[k]); -                qnorm += (qxdiff * qxdiff + qydiff * qydiff); - -                float div_upd = 0; -                div_upd -= (i > 0)?qx[l + k - padZ]:0; -                div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; -                div_upd += (i < (dimX-1))?qx[l + k]:0; -                div_upd += (j < (copY-1))?qy[l + k]:0; -                divdiff = div[l + k] - div_upd;   -                div[l + k] = div_upd; - -                resprimal +=  ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0;  -                resdual += fabs(divsigma * qxdiff + gradxdiff[k]); -                resdual += fabs(divsigma * qydiff + gradydiff[k]); -                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +    for(int j = 1; j < (copY - 1); j++) { +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +    } + +    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { +        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { +            for(int j2 = 0; j2 < BLOCK; j2 ++) { +                j = j1 + j2; +                for(int i2 = 0; i2 < BLOCK; i2++) { +                    i = i1 + i2; +                     +                    if (i == (dimX - 1)) break; +                    if (j == (copY - 1)) { j2 = BLOCK; break; } +#           include "TNV_core_loop.h" +                }                 } -                      } // i -       } // j -     } // i -    } // j + +    } + +    for(int j = 1; j < (copY - 1); j++) { +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +    } + + + +    for (j = copY - 1; j < stepY; j++) { +#       define TNV_LOOP_LAST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_LAST_J +    } +      ctx->resprimal = resprimal; -    ctx->resdual = resdual; +    ctx->resdual = resdual1 + resdual2;      ctx->product = product;      ctx->unorm = unorm;      ctx->qnorm = qnorm; @@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ)      tnv_ctx.dimY = dimY;      tnv_ctx.dimZ = dimZ;          // Padding seems actually slower -//    tnv_ctx.padZ = dimZ; +    tnv_ctx.padZ = dimZ;  //    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); -    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));      hw_sched_init(); @@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to                      float udiff = ctx->udiff0[l];                      ctx->div[l] -= ctx0->qy[l + m]; -                    ctx0->div[m + l + dimX * padZ] = ctx->div[l]; -                    ctx0->u[m + l + dimX * padZ] = ctx->u[l]; - +                    ctx0->div[m + l + dimX*padZ] = ctx->div[l]; +                    ctx0->u[m + l + dimX*padZ] = ctx->u[l]; +                                          divdiff += ctx0->qy[l + m];                      resprimal += fabs(divtau * udiff + divdiff);                   }              }          } +        { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; +                         +                    float divdiff = ctx->div0[l] - ctx->div[l]; +                    float udiff = ctx->udiff0[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } +          for (j = 0; j < tnv_ctx.threads; j++) {              tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;              resprimal += ctx->resprimal; @@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to      printf("Iterations stopped at %i with the residual %f \n", iter, residual);      printf("Return: %f\n", *uT); -//    exit(-1);      return *uT;  } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 7192475..9b19ed5 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {  static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {      long i, j, k, l; +    long i1, i2, j1, j2;      tnv_context_t *tnv_ctx = (tnv_context_t*)data;      tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {      float theta1 = 1.0f + theta;      float constant = 1.0f + taulambda; -    float resprimal = 0.0; -    float resdual = 0.0; -    float product = 0.0; -    float unorm = 0.0; -    float qnorm = 0.0; +    float resprimal = 0.0f; +    float resdual1 = 0.0f; +    float resdual2 = 0.0f; +    float product = 0.0f; +    float unorm = 0.0f; +    float qnorm = 0.0f;      float udiff[dimZ] __attribute__((aligned(64)));      float qxdiff __attribute__((aligned(64))); @@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {      float gradxdiff[dimZ] __attribute__((aligned(64)));      float gradydiff[dimZ] __attribute__((aligned(64))); -    for(int j1 = 0; j1 < stepY; j1 += BLOCK) { -     for(int i1 = 0; i1 < dimX; i1 += BLOCK) { -      for(int j2 = 0; j2 < BLOCK; j2++) { -        j = j1 + j2; -        for(int i2 = 0; i2 < BLOCK; i2++) { -            float t[3]; -            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - -            i = i1 + i2; -            if (i == dimX) break; -            if (j == stepY) { j2 = BLOCK; break; } -            l = (j * dimX  + i) * padZ; -         -//#pragma vector aligned -#pragma GCC ivdep  -            for(k = 0; k < dimZ; k++) { -                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; -                udiff[k] = u[l + k] - u_upd[l + k]; -                unorm += (udiff[k] * udiff[k]); - -                gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); -                grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); -                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; -                gradydiff[k] = grady[l + k] - grady_upd[l + k]; - -                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; -                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; -//#define TNV_NEW_STYLE                 -#ifdef TNV_NEW_STYLE                 -                qx_upd[l + k] = qx[l + k] + sigma * ubarx; -                qy_upd[l + k] = qy[l + k] + sigma * ubary; - -                float vx = divsigma * qx_upd[l + k]; //+ ubarx -                float vy = divsigma * qy_upd[l + k]; //+ ubary -#else -                float vx = ubarx + divsigma * qx[l + k]; -                float vy = ubary + divsigma * qy[l + k]; -#endif - -                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); -            } -            coefF(t, M1, M2, M3, sigma, p, q, r); -             -//#pragma vector aligned -#pragma GCC ivdep  -            for(k = 0; k < dimZ; k++) { -#ifdef TNV_NEW_STYLE     -                float vx = divsigma * qx_upd[l + k]; -                float vy = divsigma * qy_upd[l + k]; - -                float gx_upd = vx * t[0] + vy * t[1]; -                float gy_upd = vx * t[1] + vy * t[2]; - -                qx_upd[l + k] -= sigma * gx_upd; -                qy_upd[l + k] -= sigma * gy_upd; -#else -                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; -                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; -                float vx = ubarx + divsigma * qx[l + k]; -                float vy = ubary + divsigma * qy[l + k]; - -                float gx_upd = vx * t[0] + vy * t[1]; -                float gy_upd = vx * t[1] + vy * t[2]; - -                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); -                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); -#endif - -                float div_upd_val = 0; -                div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; -                div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; -                div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; -                div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; -                div_upd[l + k] = div_upd_val; - -                qxdiff = qx[l + k] - qx_upd[l + k]; -                qydiff = qy[l + k] - qy_upd[l + k]; -                qnorm += (qxdiff * qxdiff + qydiff * qydiff); - -                resdual += fabs(divsigma * qxdiff - gradxdiff[k]); -                resdual += fabs(divsigma * qydiff - gradydiff[k]); -                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); - -                if ((offY == 0)||(j > 0)) { -                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? -                    resprimal += fabs(divtau * udiff[k] + divdiff);  -                } +    j = 0; { +#       define TNV_LOOP_FIRST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_backtrack_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_FIRST_J +    } + + + +    for(int j = 1; j < (copY - 1); j++) { +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +    } + +    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { +        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { +            for(int j2 = 0; j2 < BLOCK; j2 ++) { +                j = j1 + j2; +                for(int i2 = 0; i2 < BLOCK; i2++) { +                    i = i1 + i2; +                     +                    if (i == (dimX - 1)) break; +                    if (j == (copY - 1)) { j2 = BLOCK; break; } +#           include "TNV_core_backtrack_loop.h" +                }                 } -                      } // i -       } // j -      } // i -    } // j -     + +    } + +    for(int j = 1; j < (copY - 1); j++) { +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +    } + + + +    for (j = copY - 1; j < stepY; j++) { +#       define TNV_LOOP_LAST_J +        i = 0; { +#           define TNV_LOOP_FIRST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_FIRST_I +        } +        for(i = 1; i < (dimX - 1); i++) { +#           include "TNV_core_backtrack_loop.h" +        } +        i = dimX - 1; { +#           define TNV_LOOP_LAST_I +#           include "TNV_core_backtrack_loop.h" +#           undef TNV_LOOP_LAST_I +        } +#       undef TNV_LOOP_LAST_J +    } +      ctx->resprimal = resprimal; -    ctx->resdual = resdual; +    ctx->resdual = resdual1 + resdual2;      ctx->product = product;      ctx->unorm = unorm;      ctx->qnorm = qnorm; @@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to              }          } +        { +            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; +            for(i = 0; i < dimX; i++) { +                for(k = 0; k < dimZ; k++) { +                    int l = i * padZ + k; + +                    float divdiff = ctx->div[l] - ctx->div_upd[l]; +                    float udiff = ctx->u[l] - ctx->u_upd[l]; +                    resprimal += fabs(divtau * udiff + divdiff);  +                } +            } +        } +          for (j = 0; j < tnv_ctx.threads; j++) {              tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;              resprimal += ctx->resprimal; diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h new file mode 100644 index 0000000..3ec4250 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -0,0 +1,100 @@ +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + +            l = (j * dimX  + i) * padZ; +         +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; +                udiff[k] = u[l + k] - u_upd[l + k]; +                unorm += (udiff[k] * udiff[k]); + +#ifdef TNV_LOOP_LAST_I +                gradx_upd[l + k] = 0; +#else +                gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +#endif + +#ifdef TNV_LOOP_LAST_J +                grady_upd[l + k] = 0; +#else +                grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +#endif + +                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; +                gradydiff[k] = grady[l + k] - grady_upd[l + k]; + +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE                 +#ifdef TNV_NEW_STYLE                 +                qx_upd[l + k] = qx[l + k] + sigma * ubarx; +                qy_upd[l + k] = qy[l + k] + sigma * ubary; + +                float vx = divsigma * qx_upd[l + k]; //+ ubarx +                float vy = divsigma * qy_upd[l + k]; //+ ubary +#else +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; +#endif + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE     +                float vx = divsigma * qx_upd[l + k]; +                float vy = divsigma * qy_upd[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] -= sigma * gx_upd; +                qy_upd[l + k] -= sigma * gy_upd; +#else +                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; +                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +                float vx = ubarx + divsigma * qx[l + k]; +                float vy = ubary + divsigma * qy[l + k]; + +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); +                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +                float div_upd_val = 0; +#ifndef TNV_LOOP_FIRST_I +                div_upd_val -= qx_upd[l + k - padZ]; +#endif + +#ifndef TNV_LOOP_FIRST_J +                div_upd_val -= qy_upd[l + k - dimX * padZ]; +#endif +#ifndef TNV_LOOP_LAST_I +                div_upd_val += qx_upd[l + k]; +#endif +#ifndef TNV_LOOP_LAST_J +                div_upd_val += qy_upd[l + k]; +#endif +                div_upd[l + k] = div_upd_val; + +                qxdiff = qx[l + k] - qx_upd[l + k]; +                qydiff = qy[l + k] - qy_upd[l + k]; +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]); +                resdual2 += fabs(divsigma * qydiff - gradydiff[k]); +                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +#ifndef TNV_LOOP_FIRST_J +                divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history? +                resprimal += fabs(divtau * udiff[k] + divdiff);  +#endif +            } diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h new file mode 100644 index 0000000..34e7139 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -0,0 +1,107 @@ +    { +            float t[3]; +            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; +            l = (j * dimX  + i) * padZ; +            m = dimX * padZ; +         +            float *__restrict u_next = u + l + padZ; +            float *__restrict u_current = u + l; +            float *__restrict u_next_row = u + l + m; + + +            float *__restrict qx_current = qx + l; +            float *__restrict qy_current = qy + l; +            float *__restrict qx_prev = qx + l - padZ; +            float *__restrict qy_prev = qy + l - m; + + +//  __assume(padZ%16==0); + +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < dimZ; k++) { +                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads +                udiff[k] = u[l + k] - u_upd; // cache 1w +                u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J +                udiff0[l + k] = udiff[k]; +                div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I +                float gradx_upd = 0; +#else +                float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads +                float gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J +                float grady_upd = 0; +#else +                float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads +                float grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + +                gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w +                gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w +                gradx[l + k] = gradx_upd; // 1 write +                grady[l + k] = grady_upd; // 1 write +                 +                ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w +                ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + +                float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read +                float vy = ubary[k] + divsigma * qy[l + k]; // 1 read + +                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); +            } + +            coefF(t, M1, M2, M3, sigma, p, q, r); +             +//#pragma vector aligned +#pragma GCC ivdep  +            for(k = 0; k < padZ; k++) { +                float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r +                float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r +                float gx_upd = vx * t[0] + vy * t[1]; +                float gy_upd = vx * t[1] + vy * t[2]; + +                qxdiff = sigma * (ubarx[k] - gx_upd); +                qydiff = sigma * (ubary[k] - gy_upd); +                 +                qx_current[k] += qxdiff; // write 1 +                qy_current[k] += qydiff; // write 1 + +                unorm += (udiff[k] * udiff[k]); +                qnorm += (qxdiff * qxdiff + qydiff * qydiff); + +                float div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I +                div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J +                div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I +                div_upd += qx_current[k];  +#endif +#ifndef TNV_LOOP_LAST_J +                div_upd += qy_current[k];  +#endif + +                divdiff = div[l + k] - div_upd;  // 1 read +                div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J +                resprimal += fabs(divtau * udiff[k] + divdiff);  +#endif + +                    // We need to have two independent accumulators to allow gcc-autovectorization +                resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r +                resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r +                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); +            } +    } +    
\ No newline at end of file | 
