diff options
| -rw-r--r-- | cuda/2d/astra.cu | 2 | ||||
| -rw-r--r-- | cuda/2d/cgls.cu | 8 | ||||
| -rw-r--r-- | cuda/2d/em.cu | 4 | ||||
| -rw-r--r-- | cuda/2d/sart.cu | 8 | ||||
| -rw-r--r-- | cuda/2d/sirt.cu | 8 | ||||
| -rw-r--r-- | cuda/2d/util.cu | 10 | ||||
| -rw-r--r-- | cuda/2d/util.h | 4 | 
7 files changed, 28 insertions, 16 deletions
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu index 15e487c..f4d4717 100644 --- a/cuda/2d/astra.cu +++ b/cuda/2d/astra.cu @@ -612,7 +612,7 @@ float BPalgo::computeDiffNorm()  	allocateProjectionData(D_projData, projPitch, dims); -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice); +	duplicateProjectionData(D_projData, D_sinoData, sinoPitch, dims);  	callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);  	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles); diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu index fce8beb..066ac5d 100644 --- a/cuda/2d/cgls.cu +++ b/cuda/2d/cgls.cu @@ -120,12 +120,12 @@ bool CGLS::iterate(unsigned int iterations)  	if (!sliceInitialized) {  		// copy sinogram -		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +		duplicateProjectionData(D_r, D_sinoData, sinoPitch, dims);  		// r = sino - A*x  		if (useVolumeMask) {  			// Use z as temporary storage here since it is unused -			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			duplicateVolumeData(D_z, D_volumeData, volumePitch, dims);  			processVol<opMul>(D_z, D_maskData, zPitch, dims);  			callFP(D_z, zPitch, D_r, rPitch, -1.0f);  		} else { @@ -189,11 +189,11 @@ float CGLS::computeDiffNorm()  	// used outside of iterations.  	// copy sinogram to w -	cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	duplicateProjectionData(D_w, D_sinoData, sinoPitch, dims);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			duplicateVolumeData(D_z, D_volumeData, volumePitch, dims);  			processVol<opMul>(D_z, D_maskData, zPitch, dims);  			callFP(D_z, zPitch, D_w, wPitch, -1.0f);  	} else { diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu index c75f250..ebb76b5 100644 --- a/cuda/2d/em.cu +++ b/cuda/2d/em.cu @@ -150,11 +150,11 @@ bool EM::iterate(unsigned int iterations)  float EM::computeDiffNorm()  {  	// copy sinogram to projection data -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	duplicateProjectionData(D_projData, D_sinoData, sinoPitch, dims);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			duplicateVolumeData(D_tmpData, D_volumeData, volumePitch, dims);  			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else { diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu index 048661f..64d6f28 100644 --- a/cuda/2d/sart.cu +++ b/cuda/2d/sart.cu @@ -180,11 +180,11 @@ bool SART::iterate(unsigned int iterations)  		}  		// copy one line of sinogram to projection data -		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), 1, cudaMemcpyDeviceToDevice); +		duplicateProjectionData(D_projData, D_sinoData, sinoPitch, dims);  		// do FP, subtracting projection from sinogram  		if (useVolumeMask) { -				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +				duplicateVolumeData(D_tmpData, D_volumeData, volumePitch, dims);  				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims);  				callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f);  		} else { @@ -223,11 +223,11 @@ float SART::computeDiffNorm()  	zeroProjectionData(D_p, pPitch, dims);  	// copy sinogram to D_p -	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	duplicateProjectionData(D_p, D_sinoData, sinoPitch, dims);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			duplicateVolumeData(D_tmpData, D_volumeData, volumePitch, dims);  			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else { diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu index c402864..d34a180 100644 --- a/cuda/2d/sirt.cu +++ b/cuda/2d/sirt.cu @@ -191,11 +191,11 @@ bool SIRT::iterate(unsigned int iterations)  	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {  		// copy sinogram to projection data -		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +		duplicateProjectionData(D_projData, D_sinoData, projPitch, dims);  		// do FP, subtracting projection from sinogram  		if (useVolumeMask) { -				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +				duplicateVolumeData(D_tmpData, D_volumeData, volumePitch, dims);  				processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims);  				callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  		} else { @@ -226,11 +226,11 @@ bool SIRT::iterate(unsigned int iterations)  float SIRT::computeDiffNorm()  {  	// copy sinogram to projection data -	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets), dims.iProjAngles, cudaMemcpyDeviceToDevice); +	duplicateProjectionData(D_projData, D_sinoData, projPitch, dims);  	// do FP, subtracting projection from sinogram  	if (useVolumeMask) { -			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth), dims.iVolHeight, cudaMemcpyDeviceToDevice); +			duplicateVolumeData(D_tmpData, D_volumeData, volumePitch, dims);  			processVol<opMul>(D_tmpData, D_maskData, tmpPitch, dims);  			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);  	} else { diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu index 8d3b625..dba70d9 100644 --- a/cuda/2d/util.cu +++ b/cuda/2d/util.cu @@ -129,6 +129,15 @@ void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dim  	zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);  } +void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims) +{ +	cudaMemcpy2D(D_dst, sizeof(float)*pitch, D_src, sizeof(float)*pitch, sizeof(float)*dims.iVolWidth, dims.iVolHeight, cudaMemcpyDeviceToDevice); +} + +void duplicateProjectionData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims) +{ +	cudaMemcpy2D(D_dst, sizeof(float)*pitch, D_src, sizeof(float)*pitch, sizeof(float)*dims.iProjDets, dims.iProjAngles, cudaMemcpyDeviceToDevice); +}  template <unsigned int blockSize>  __global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n) @@ -206,7 +215,6 @@ __global__ void reduce2D(float *g_idata, float *g_odata,  float dotProduct2D(float* D_data, unsigned int pitch,                     unsigned int width, unsigned int height)  { -#warning FIX MEMORY ORDER  	unsigned int bx = (width + 15) / 16;  	unsigned int by = (height + 127) / 128;  	unsigned int shared_mem2 = sizeof(float) * 16 * 16; diff --git a/cuda/2d/util.h b/cuda/2d/util.h index 83cb794..c0ec49e 100644 --- a/cuda/2d/util.h +++ b/cuda/2d/util.h @@ -80,6 +80,10 @@ bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimension  void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims);  void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims); +void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims); +void duplicateProjectionData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims); + +  bool cudaTextForceKernelsCompletion();  void reportCudaError(cudaError_t err);  | 
