From accc4439d9dd035765ed77d94a0ceece3270cc0b Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@ipecompute4.ands.kit.edu>
Date: Tue, 26 Jul 2022 23:32:36 +0200
Subject: Half-precision back-/forward-projection for parallel geometry

---
 cuda/3d/par3d_bp.cu | 21 ++++++++++++++++++---
 cuda/3d/par3d_fp.cu | 19 ++++++++++++++++++-
 cuda/3d/rounding.h  | 11 +++++++++++
 3 files changed, 47 insertions(+), 4 deletions(-)
 create mode 100644 cuda/3d/rounding.h

(limited to 'cuda')

diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index e43479a..75cbf2b 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -75,7 +75,7 @@ static bool bindProjDataTexture(const cudaArray* array)
 	return true;
 }
 
-
+#include "rounding.h"
 __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
 {
 	float* volData = (float*)D_volData;
@@ -122,8 +122,23 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn
 			float fV = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z;
 
 			for (int idx = 0; idx < ZSIZE; ++idx) {
-
-				float fVal = tex3D(gT_par3DProjTexture, fU, fAngle, fV);
+				float fVal;
+				textype h5 = texto(0.5f);
+				textype fU_ = texto(fU);
+				textype fUf_ = texto(floor(fU));
+				float fUf = floor(fU);
+				
+				if ((fU - fUf) < 0.5f) {
+				    textype fVal1 = texto(tex3D(gT_par3DProjTexture, fUf - 0.5f, fAngle, fV));
+				    textype fVal2 = texto(tex3D(gT_par3DProjTexture, fUf + 0.5f, fAngle, fV));
+				    fVal = texfrom(fVal1 + (fU_ + h5 - fUf_) * (fVal2 - fVal1));
+				} else {
+				    textype fVal1 = texto(tex3D(gT_par3DProjTexture, fUf + 0.5f, fAngle, fV));
+				    textype fVal2 = texto(tex3D(gT_par3DProjTexture, fUf + 1.5f, fAngle, fV));
+				    fVal = texfrom(fVal1 + (fU_ - h5 - fUf_) * (fVal2 - fVal1));
+				}
+
+//				float fVal = tex3D(gT_par3DProjTexture, fU, fAngle, fV);
 				Z[idx] += fVal;
 
 				fU += fCu.z;
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index a99308f..3ad9f0d 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -146,6 +146,7 @@ struct SCALE_NONCUBE {
 // blockIdx:  x = u/v detector
 //            y = angle block
 
+#include "rounding.h"
 
 template<class COORD, class SCALE>
 __global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
@@ -212,7 +213,23 @@ __global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
 
 		for (int s = startSlice; s < endSlice; ++s)
 		{
-			fVal += c.tex(f0, f1, f2);
+			textype h5 = texto(0.5f);
+			textype f1_ = texto(f1);
+			textype f1f_ = texto(floor(f1));
+			float f1f = floor(f1);
+				
+			if ((f1 - f1f) < 0.5f) {
+			    textype fVal1 = texto(c.tex(f0, f1f - 0.5f, f2));
+			    textype fVal2 = texto(c.tex(f0, f1f + 0.5f, f2));
+			    fVal += texfrom(fVal1 + (f1_ + h5 - f1f_) * (fVal2 - fVal1));
+//			    fVal += texfrom(__hfma(__hadd(h5,__hsub(f1_, f1f_)), __hsub(fVal2, fVal1), fVal1));
+			} else {
+			    textype fVal1 = texto(c.tex(f0, f1f + 0.5f, f2));
+			    textype fVal2 = texto(c.tex(f0, f1f + 1.5f, f2));
+			    fVal += texfrom(fVal1 + (f1_ - h5 - f1f_) * (fVal2 - fVal1));
+			}
+
+//			fVal += c.tex(f0, f1, f2);
 			f0 += 1.0f;
 			f1 += a1;
 			f2 += a2;
diff --git a/cuda/3d/rounding.h b/cuda/3d/rounding.h
new file mode 100644
index 0000000..b7ccc1c
--- /dev/null
+++ b/cuda/3d/rounding.h
@@ -0,0 +1,11 @@
+#include <cuda_fp16.h>
+
+#define texto __float2half
+#define texfrom __half2float
+#define textype half
+
+
+//#define texto
+//#define texfrom
+//#define textype float
+
-- 
cgit v1.2.3