18 files changed, 760 insertions, 1335 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index 0e5209f..3275f6d 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -34,13 +34,13 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "fan_bp.h"
 #include "util.h"
 #include "arith.h"
+#include "astra.h"
 
 namespace astraCUDA {
 
 ReconAlgo::ReconAlgo()
 {
-	angles = 0;
-	TOffsets = 0;
+	parProjs = 0;
 	fanProjs = 0;
 	shouldAbort = false;
 
@@ -65,8 +65,7 @@ ReconAlgo::~ReconAlgo()
 
 void ReconAlgo::reset()
 {
-	delete[] angles;
-	delete[] TOffsets;
+	delete[] parProjs;
 	delete[] fanProjs;
 
 	if (freeGPUMemory) {
@@ -76,8 +75,7 @@ void ReconAlgo::reset()
 		cudaFree(D_volumeData);
 	}
 
-	angles = 0;
-	TOffsets = 0;
+	parProjs = 0;
 	fanProjs = 0;
 	shouldAbort = false;
 
@@ -123,46 +121,40 @@ bool ReconAlgo::enableSinogramMask()
 }
 
 
-bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles)
+bool ReconAlgo::setGeometry(const astra::CVolumeGeometry2D* pVolGeom,
+                            const astra::CProjectionGeometry2D* pProjGeom)
 {
-	dims = _dims;
+	bool ok;
 
-	angles = new float[dims.iProjAngles];
+	ok = convertAstraGeometry_dims(pVolGeom, pProjGeom, dims);
 
-	memcpy(angles, _angles, sizeof(angles[0]) * dims.iProjAngles);
+	if (!ok)
+		return false;
 
+	delete[] parProjs;
+	parProjs = 0;
 	delete[] fanProjs;
 	fanProjs = 0;
 
-	return true;
-}
-
-bool ReconAlgo::setFanGeometry(const SDimensions& _dims,
-                               const SFanProjection* _projs)
-{
-	dims = _dims;
-	fanProjs = new SFanProjection[dims.iProjAngles];
-
-	memcpy(fanProjs, _projs, sizeof(fanProjs[0]) * dims.iProjAngles);
-
-	delete[] angles;
-	angles = 0;
+	fOutputScale = 1.0f;
+	ok = convertAstraGeometry(pVolGeom, pProjGeom, parProjs, fanProjs, fOutputScale);
+	if (!ok)
+		return false;
 
 	return true;
 }
 
-
-bool ReconAlgo::setTOffsets(const float* _TOffsets)
+bool ReconAlgo::setSuperSampling(int raysPerDet, int raysPerPixelDim)
 {
-	// TODO: determine if they're all zero?
-	TOffsets = new float[dims.iProjAngles];
-	memcpy(TOffsets, _TOffsets, sizeof(angles[0]) * dims.iProjAngles);
+	if (raysPerDet <= 0 || raysPerPixelDim <= 0)
+		return false;
+
+	dims.iRaysPerDet = raysPerDet;
+	dims.iRaysPerPixelDim = raysPerPixelDim;
 
 	return true;
 }
 
-
-
 bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch)
 {
 	assert(useVolumeMask);
@@ -323,14 +315,14 @@ bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch,
                        float* D_projData, unsigned int projPitch,
                        float outputScale)
 {
-	if (angles) {
+	if (parProjs) {
 		assert(!fanProjs);
 		return FP(D_volumeData, volumePitch, D_projData, projPitch,
-		          dims, angles, TOffsets, outputScale);
+		          dims, parProjs, fOutputScale * outputScale);
 	} else {
 		assert(fanProjs);
 		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
-		             dims, fanProjs, outputScale);
+		             dims, fanProjs, fOutputScale * outputScale);
 	}
 }
 
@@ -338,10 +330,10 @@ bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch,
                        float* D_projData, unsigned int projPitch,
                        float outputScale)
 {
-	if (angles) {
+	if (parProjs) {
 		assert(!fanProjs);
 		return BP(D_volumeData, volumePitch, D_projData, projPitch,
-		          dims, angles, TOffsets, outputScale);
+		          dims, parProjs, outputScale);
 	} else {
 		assert(fanProjs);
 		return FanBP(D_volumeData, volumePitch, D_projData, projPitch,
diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h
index 4a75907..d70859f 100644
--- a/cuda/2d/algo.h
+++ b/cuda/2d/algo.h
@@ -31,6 +31,17 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include "astra/Globals.h"
 #include "dims.h"
 
+namespace astra {
+
+class CParallelProjectionGeometry2D;
+class CParallelVecProjectionGeometry2D;
+class CFanFlatProjectionGeometry2D;
+class CFanFlatVecProjectionGeometry2D;
+class CVolumeGeometry2D;
+class CProjectionGeometry2D;
+
+}
+
 namespace astraCUDA {
 
 class _AstraExport ReconAlgo {
@@ -40,11 +51,10 @@ public:
 
 	bool setGPUIndex(int iGPUIndex);
 
-	bool setGeometry(const SDimensions& dims, const float* angles);
-	bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs);
+	bool setGeometry(const astra::CVolumeGeometry2D* pVolGeom,
+	                 const astra::CProjectionGeometry2D* pProjGeom);
 
-	// setTOffsets should (optionally) be called after setGeometry
-	bool setTOffsets(const float* TOffsets);
+	bool setSuperSampling(int raysPerDet, int raysPerPixelDim);
 
 	void signalAbort() { shouldAbort = true; }
 
@@ -123,9 +133,9 @@ protected:
 
 
 	SDimensions dims;
-	float* angles;
-	float* TOffsets;
+	SParProjection* parProjs;
 	SFanProjection* fanProjs;
+	float fOutputScale;
 
 	volatile bool shouldAbort;
 
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index c0132b2..81459de 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -41,8 +41,10 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include <fstream>
 #include <cuda.h>
 
+#include "../../include/astra/GeometryUtil2D.h"
 #include "../../include/astra/VolumeGeometry2D.h"
 #include "../../include/astra/ParallelProjectionGeometry2D.h"
+#include "../../include/astra/ParallelVecProjectionGeometry2D.h"
 #include "../../include/astra/FanFlatProjectionGeometry2D.h"
 #include "../../include/astra/FanFlatVecProjectionGeometry2D.h"
 
@@ -63,514 +65,6 @@ enum CUDAProjectionType {
 };
 
 
-class AstraFBP_internal {
-public:
-	SDimensions dims;
-	float* angles;
-	float* TOffsets;
-	astraCUDA::SFanProjection* fanProjections;
-
-	float fOriginSourceDistance;
-	float fOriginDetectorDistance;
-
-	float fPixelSize;
-
-	bool bFanBeam;
-	bool bShortScan;
-
-	bool initialized;
-	bool setStartReconstruction;
-
-	float* D_sinoData;
-	unsigned int sinoPitch;
-
-	float* D_volumeData;
-	unsigned int volumePitch;
-
-	cufftComplex * m_pDevFilter;
-};
-
-AstraFBP::AstraFBP()
-{
-	pData = new AstraFBP_internal();
-
-	pData->angles = 0;
-	pData->fanProjections = 0;
-	pData->TOffsets = 0;
-	pData->D_sinoData = 0;
-	pData->D_volumeData = 0;
-
-	pData->dims.iVolWidth = 0;
-	pData->dims.iProjAngles = 0;
-	pData->dims.fDetScale = 1.0f;
-	pData->dims.iRaysPerDet = 1;
-	pData->dims.iRaysPerPixelDim = 1;
-
-	pData->initialized = false;
-	pData->setStartReconstruction = false;
-
-	pData->m_pDevFilter = NULL;
-}
-
-AstraFBP::~AstraFBP()
-{
-	delete[] pData->angles;
-	pData->angles = 0;
-
-	delete[] pData->TOffsets;
-	pData->TOffsets = 0;
-
-	delete[] pData->fanProjections;
-	pData->fanProjections = 0;
-
-	cudaFree(pData->D_sinoData);
-	pData->D_sinoData = 0;
-
-	cudaFree(pData->D_volumeData);
-	pData->D_volumeData = 0;
-
-	if(pData->m_pDevFilter != NULL)
-	{
-		freeComplexOnDevice(pData->m_pDevFilter);
-		pData->m_pDevFilter = NULL;
-	}
-
-	delete pData;
-	pData = 0;
-}
-
-bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth,
-                                          unsigned int iVolHeight,
-                                          float fPixelSize)
-{
-	if (pData->initialized)
-		return false;
-
-	pData->dims.iVolWidth = iVolWidth;
-	pData->dims.iVolHeight = iVolHeight;
-
-	pData->fPixelSize = fPixelSize;
-
-	return (iVolWidth > 0 && iVolHeight > 0 && fPixelSize > 0.0f);
-}
-
-bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles,
-                                      unsigned int iProjDets,
-                                      const float* pfAngles,
-                                      float fDetSize)
-{
-	if (pData->initialized)
-		return false;
-
-	pData->dims.iProjAngles = iProjAngles;
-	pData->dims.iProjDets = iProjDets;
-	pData->dims.fDetScale = fDetSize / pData->fPixelSize;
-
-	if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
-		return false;
-
-	pData->angles = new float[iProjAngles];
-	memcpy(pData->angles, pfAngles, iProjAngles * sizeof(pfAngles[0]));
-
-	pData->bFanBeam = false;
-
-	return true;
-}
-
-bool AstraFBP::setFanGeometry(unsigned int iProjAngles,
-                              unsigned int iProjDets,
-                              const astraCUDA::SFanProjection *fanProjs,
-                              const float* pfAngles,
-                              float fOriginSourceDistance,
-                              float fOriginDetectorDistance,
-                              float fDetSize,
-                              bool bShortScan)
-{
-	// Slightly abusing setProjectionGeometry for this...
-	if (!setProjectionGeometry(iProjAngles, iProjDets, pfAngles, fDetSize))
-		return false;
-
-	pData->fOriginSourceDistance = fOriginSourceDistance;
-	pData->fOriginDetectorDistance = fOriginDetectorDistance;
-
-	pData->fanProjections = new astraCUDA::SFanProjection[iProjAngles];
-	memcpy(pData->fanProjections, fanProjs, iProjAngles * sizeof(fanProjs[0]));
-
-	pData->bFanBeam = true;
-	pData->bShortScan = bShortScan;
-
-	return true;
-}
-
-
-bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling)
-{
-	if (pData->initialized)
-		return false;
-
-	if (iPixelSuperSampling == 0)
-		return false;
-
-	pData->dims.iRaysPerPixelDim = iPixelSuperSampling;
-
-	return true;
-}
-
-
-bool AstraFBP::setTOffsets(const float* pfTOffsets)
-{
-	if (pData->initialized)
-		return false;
-
-	if (pfTOffsets == 0)
-		return false;
-
-	pData->TOffsets = new float[pData->dims.iProjAngles];
-	memcpy(pData->TOffsets, pfTOffsets, pData->dims.iProjAngles * sizeof(pfTOffsets[0]));
-
-	return true;
-}
-
-bool AstraFBP::init(int iGPUIndex)
-{
-	if (pData->initialized)
-	{
-		return false;
-	}
-
-	if (pData->dims.iProjAngles == 0 || pData->dims.iVolWidth == 0)
-	{
-		return false;
-	}
-
-	if (iGPUIndex != -1) {
-		cudaSetDevice(iGPUIndex);
-		cudaError_t err = cudaGetLastError();
-
-		// Ignore errors caused by calling cudaSetDevice multiple times
-		if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
-		{
-			return false;
-		}
-	}
-
-	bool ok = allocateVolumeData(pData->D_volumeData, pData->volumePitch, pData->dims);
-	if (!ok)
-	{
-		return false;
-	}
-
-	ok = allocateProjectionData(pData->D_sinoData, pData->sinoPitch, pData->dims);
-	if (!ok)
-	{
-		cudaFree(pData->D_volumeData);
-		pData->D_volumeData = 0;
-		return false;
-	}
-
-	pData->initialized = true;
-
-	return true;
-}
-
-bool AstraFBP::setSinogram(const float* pfSinogram,
-                            unsigned int iSinogramPitch)
-{
-	if (!pData->initialized)
-		return false;
-	if (!pfSinogram)
-		return false;
-
-	bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch,
-	                               pData->dims,
-	                               pData->D_sinoData, pData->sinoPitch);
-	if (!ok)
-		return false;
-
-	// rescale sinogram to adjust for pixel size
-	processSino<opMul>(pData->D_sinoData,
-	                       1.0f/(pData->fPixelSize*pData->fPixelSize),
-	                       pData->sinoPitch, pData->dims);
-
-	pData->setStartReconstruction = false;
-
-	return true;
-}
-
-static int calcNextPowerOfTwo(int _iValue)
-{
-	int iOutput = 1;
-
-	while(iOutput < _iValue)
-	{
-		iOutput *= 2;
-	}
-
-	return iOutput;
-}
-
-bool AstraFBP::run()
-{
-	if (!pData->initialized)
-	{
-		return false;
-	}
-
-	zeroVolumeData(pData->D_volumeData, pData->volumePitch, pData->dims);
-
-	bool ok = false;
-
-	if (pData->bFanBeam) {
-		// Call FDK_PreWeight to handle fan beam geometry. We treat
-		// this as a cone beam setup of a single slice:
-
-		// TODO: TOffsets affects this preweighting...
-
-		// We create a fake cudaPitchedPtr
-		cudaPitchedPtr tmp;
-		tmp.ptr = pData->D_sinoData;
-		tmp.pitch = pData->sinoPitch * sizeof(float);
-		tmp.xsize = pData->dims.iProjDets;
-		tmp.ysize = pData->dims.iProjAngles;
-		// and a fake Dimensions3D
-		astraCUDA3d::SDimensions3D dims3d;
-		dims3d.iVolX = pData->dims.iVolWidth;
-		dims3d.iVolY = pData->dims.iVolHeight;
-		dims3d.iVolZ = 1;
-		dims3d.iProjAngles = pData->dims.iProjAngles;
-		dims3d.iProjU = pData->dims.iProjDets;
-		dims3d.iProjV = 1;
-
-		astraCUDA3d::FDK_PreWeight(tmp, pData->fOriginSourceDistance,
-		              pData->fOriginDetectorDistance, 0.0f,
-		              pData->dims.fDetScale, 1.0f, // TODO: Are these correct?
-		              pData->bShortScan, dims3d, pData->angles);
-	}
-
-	if (pData->m_pDevFilter) {
-
-		int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
-		int iFFTFourDetCount = calcFFTFourSize(iFFTRealDetCount);
-
-		cufftComplex * pDevComplexSinogram = NULL;
-
-		allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram);
-
-		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
-
-		applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter);
-
-		runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
-
-		freeComplexOnDevice(pDevComplexSinogram);
-
-	}
-
-	float fOutputScale = (M_PI / 2.0f) / (float)pData->dims.iProjAngles;
-
-	if (pData->bFanBeam) {
-		ok = FanBP_FBPWeighted(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->fanProjections, fOutputScale);
-
-	} else {
-		ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets, fOutputScale);
-	}
-	if(!ok)
-	{
-		return false;
-	}
-
-	return true;
-}
-
-bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const
-{
-	if (!pData->initialized)
-		return false;
-
-	bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch,
-	                               pData->dims,
-	                               pData->D_volumeData, pData->volumePitch);
-	if (!ok)
-		return false;
-
-	return true;
-}
-
-int AstraFBP::calcFourierFilterSize(int _iDetectorCount)
-{
-	int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount);
-	int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount);
-
-	// CHECKME: Matlab makes this at least 64. Do we also need to?
-	return iFreqBinCount;
-}
-
-bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */)
-{
-	if(pData->m_pDevFilter != 0)
-	{
-		freeComplexOnDevice(pData->m_pDevFilter);
-		pData->m_pDevFilter = 0;
-	}
-
-	if (_eFilter == FILTER_NONE)
-		return true; // leave pData->m_pDevFilter set to 0
-
-
-	int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
-	int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount);
-
-	cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount];
-	memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount);
-
-	allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter));
-
-	switch(_eFilter)
-	{
-		case FILTER_NONE:
-			// handled above
-			break;
-		case FILTER_RAMLAK:
-		case FILTER_SHEPPLOGAN:
-		case FILTER_COSINE:
-		case FILTER_HAMMING:
-		case FILTER_HANN:
-		case FILTER_TUKEY:
-		case FILTER_LANCZOS:
-		case FILTER_TRIANGULAR:
-		case FILTER_GAUSSIAN:
-		case FILTER_BARTLETTHANN:
-		case FILTER_BLACKMAN:
-		case FILTER_NUTTALL:
-		case FILTER_BLACKMANHARRIS:
-		case FILTER_BLACKMANNUTTALL:
-		case FILTER_FLATTOP:
-		case FILTER_PARZEN:
-		{
-			genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter);
-			uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
-
-			break;
-		}
-		case FILTER_PROJECTION:
-		{
-			// make sure the offered filter has the correct size
-			assert(_iFilterWidth == iFreqBinCount);
-
-			for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
-			{
-				float fValue = _pfHostFilter[iFreqBinIndex];
-
-				for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
-				{
-					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
-					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
-				}
-			}
-			uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
-			break;
-		}
-		case FILTER_SINOGRAM:
-		{
-			// make sure the offered filter has the correct size
-			assert(_iFilterWidth == iFreqBinCount);
-
-			for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
-			{
-				for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
-				{
-					float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth];
-
-					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
-					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
-				}
-			}
-			uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
-			break;
-		}
-		case FILTER_RPROJECTION:
-		{
-			int iProjectionCount = pData->dims.iProjAngles;
-			int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
-			float * pfHostRealFilter = new float[iRealFilterElementCount];
-			memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
-
-			int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
-			int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
-			int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
-
-			int iFilterShiftSize = _iFilterWidth / 2;
-
-			for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
-			{
-				int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
-				float fValue = _pfHostFilter[iDetectorIndex];
-
-				for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
-				{
-					pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
-				}
-			}
-
-			float* pfDevRealFilter = NULL;
-			cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
-			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
-			delete[] pfHostRealFilter;
-
-			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
-
-			cudaFree(pfDevRealFilter);
-
-			break;
-		}
-		case FILTER_RSINOGRAM:
-		{
-			int iProjectionCount = pData->dims.iProjAngles;
-			int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
-			float* pfHostRealFilter = new float[iRealFilterElementCount];
-			memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
-
-			int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
-			int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
-			int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
-
-			int iFilterShiftSize = _iFilterWidth / 2;
-
-			for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
-			{
-				int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
-
-				for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++)
-				{
-					float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth];
-					pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
-				}
-			}
-
-			float* pfDevRealFilter = NULL;
-			cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
-			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
-			delete[] pfHostRealFilter;
-
-			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
-
-			cudaFree(pfDevRealFilter);
-
-			break;
-		}
-		default:
-		{
-			ASTRA_ERROR("AstraFBP::setFilter: Unknown filter type requested");
-			delete [] pHostFilter;
-			return false;
-		}
-	}
-
-	delete [] pHostFilter;
-
-	return true;
-}
-
 BPalgo::BPalgo()
 {
 
@@ -615,18 +109,17 @@ float BPalgo::computeDiffNorm()
 bool astraCudaFP(const float* pfVolume, float* pfSinogram,
                  unsigned int iVolWidth, unsigned int iVolHeight,
                  unsigned int iProjAngles, unsigned int iProjDets,
-                 const float *pfAngles, const float *pfOffsets,
-                 float fDetSize, unsigned int iDetSuperSampling,
+                 const SParProjection *pAngles,
+                 unsigned int iDetSuperSampling,
                  float fOutputScale, int iGPUIndex)
 {
 	SDimensions dims;
 
-	if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+	if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0)
 		return false;
 
 	dims.iProjAngles = iProjAngles;
 	dims.iProjDets = iProjDets;
-	dims.fDetScale = fDetSize;
 
 	if (iDetSuperSampling == 0)
 		return false;
@@ -676,7 +169,7 @@ bool astraCudaFP(const float* pfVolume, float* pfSinogram,
 	}
 
 	zeroProjectionData(D_sinoData, sinoPitch, dims);
-	ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, fOutputScale);
+	ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, fOutputScale);
 	if (!ok) {
 		cudaFree(D_volumeData);
 		cudaFree(D_sinoData);
@@ -711,7 +204,6 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 
 	dims.iProjAngles = iProjAngles;
 	dims.iProjDets = iProjDets;
-	dims.fDetScale = 1.0f; // TODO?
 
 	if (iDetSuperSampling == 0)
 		return false;
@@ -787,123 +279,99 @@ bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 }
 
 
-bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
-                          const CParallelProjectionGeometry2D* pProjGeom,
-                          float*& detectorOffsets, float*& projectionAngles,
-                          float& detSize, float& outputScale)
+// adjust pProjs to normalize volume geometry
+template<typename ProjectionT>
+static bool convertAstraGeometry_internal(const CVolumeGeometry2D* pVolGeom,
+                          unsigned int iProjectionAngleCount,
+                          ProjectionT*& pProjs,
+                          float& fOutputScale)
 {
-	assert(pVolGeom);
-	assert(pProjGeom);
-	assert(pProjGeom->getProjectionAngles());
-
+	// TODO: Make EPS relative
 	const float EPS = 0.00001f;
 
-	int nth = pProjGeom->getProjectionAngleCount();
-
 	// Check if pixels are square
 	if (abs(pVolGeom->getPixelLengthX() - pVolGeom->getPixelLengthY()) > EPS)
 		return false;
 
+	float dx = -(pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2;
+	float dy = -(pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2;
 
-	// Scale volume pixels to 1x1
-	detSize = pProjGeom->getDetectorWidth() / pVolGeom->getPixelLengthX();
+	float factor = 1.0f / pVolGeom->getPixelLengthX();
 
-	// Copy angles
-	float *angles = new float[nth];
-	for (int i = 0; i < nth; ++i)
-		angles[i] = pProjGeom->getProjectionAngles()[i];
-	projectionAngles = angles;
-
-	// Check if we need to translate
-	bool offCenter = false;
-	if (abs(pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) > EPS ||
-	    abs(pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) > EPS)
-	{
-		offCenter = true;
+	for (int i = 0; i < iProjectionAngleCount; ++i) {
+		// CHECKME: Order of scaling and translation
+		pProjs[i].translate(dx, dy);
+		pProjs[i].scale(factor);
 	}
+	// CHECKME: Check factor
+	fOutputScale *= pVolGeom->getPixelLengthX() * pVolGeom->getPixelLengthY();
 
-	// If there are existing detector offsets, or if we need to translate,
-	// we need to return offsets
-	if (pProjGeom->getExtraDetectorOffset() || offCenter)
-	{
-		float* offset = new float[nth];
-
-		if (pProjGeom->getExtraDetectorOffset()) {
-			for (int i = 0; i < nth; ++i)
-				offset[i] = pProjGeom->getExtraDetectorOffset()[i];
-		} else {
-			for (int i = 0; i < nth; ++i)
-				offset[i] = 0.0f;
-		}
+	return true;
+}
 
-		if (offCenter) {
-			float dx = (pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2;
-			float dy = (pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2;
 
-			// CHECKME: Is d in pixels or in units?
 
-			for (int i = 0; i < nth; ++i) {
-				float d = dx * cos(angles[i]) + dy * sin(angles[i]);
-				offset[i] += d;
-			}
-		}
+bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
+                          const CParallelProjectionGeometry2D* pProjGeom,
+                          SParProjection*& pProjs,
+                          float& fOutputScale)
+{
+	assert(pVolGeom);
+	assert(pProjGeom);
+	assert(pProjGeom->getProjectionAngles());
 
-		// CHECKME: Order of scaling and translation
+	int nth = pProjGeom->getProjectionAngleCount();
 
-		// Scale volume pixels to 1x1
-		for (int i = 0; i < nth; ++i) {
-			//offset[i] /= pVolGeom->getPixelLengthX();
-			//offset[i] *= detSize;
-		}
+	pProjs = genParProjections(nth,
+	                           pProjGeom->getDetectorCount(),
+	                           pProjGeom->getDetectorWidth(),
+	                           pProjGeom->getProjectionAngles(), 0);
 
+	bool ok;
+	fOutputScale = 1.0f;
 
-		detectorOffsets = offset;
-	} else {
-		detectorOffsets = 0;
-	}
+	ok = convertAstraGeometry_internal(pVolGeom, nth, pProjs, fOutputScale);
 
-	outputScale = pVolGeom->getPixelLengthX();
-	outputScale *= outputScale;
+	if (!ok) {
+		delete[] pProjs;
+		pProjs = 0;
+	}
 
-	return true;
+	return ok;
 }
 
-static void convertAstraGeometry_internal(const CVolumeGeometry2D* pVolGeom,
-                          unsigned int iProjectionAngleCount,
-                          astraCUDA::SFanProjection*& pProjs,
-                          float& outputScale)
+bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
+                          const CParallelVecProjectionGeometry2D* pProjGeom,
+                          SParProjection*& pProjs,
+                          float& fOutputScale)
 {
-	// Translate
-	float dx = (pVolGeom->getWindowMinX() + pVolGeom->getWindowMaxX()) / 2;
-	float dy = (pVolGeom->getWindowMinY() + pVolGeom->getWindowMaxY()) / 2;
+	assert(pVolGeom);
+	assert(pProjGeom);
+	assert(pProjGeom->getProjectionVectors());
 
-	for (int i = 0; i < iProjectionAngleCount; ++i) {
-		pProjs[i].fSrcX -= dx;
-		pProjs[i].fSrcY -= dy;
-		pProjs[i].fDetSX -= dx;
-		pProjs[i].fDetSY -= dy;
+	int nth = pProjGeom->getProjectionAngleCount();
+
+	pProjs = new SParProjection[nth];
+
+	for (int i = 0; i < nth; ++i) {
+		pProjs[i] = pProjGeom->getProjectionVectors()[i];
 	}
 
-	// CHECKME: Order of scaling and translation
+	bool ok;
+	fOutputScale = 1.0f;
 
-	// Scale
-	float factor = 1.0f / pVolGeom->getPixelLengthX();
-	for (int i = 0; i < iProjectionAngleCount; ++i) {
-		pProjs[i].fSrcX *= factor;
-		pProjs[i].fSrcY *= factor;
-		pProjs[i].fDetSX *= factor;
-		pProjs[i].fDetSY *= factor;
-		pProjs[i].fDetUX *= factor;
-		pProjs[i].fDetUY *= factor;
+	ok = convertAstraGeometry_internal(pVolGeom, nth, pProjs, fOutputScale);
 
+	if (!ok) {
+		delete[] pProjs;
+		pProjs = 0;
 	}
 
-	// CHECKME: Check factor
-	outputScale = pVolGeom->getPixelLengthX();
-//	outputScale *= outputScale;
+	return ok;
 }
 
 
+
 bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
                           const CFanFlatProjectionGeometry2D* pProjGeom,
                           astraCUDA::SFanProjection*& pProjs,
@@ -913,6 +381,7 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
 	assert(pProjGeom);
 	assert(pProjGeom->getProjectionAngles());
 
+	// TODO: Make EPS relative
 	const float EPS = 0.00001f;
 
 	int nth = pProjGeom->getProjectionAngleCount();
@@ -931,23 +400,9 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
 	float fDetSize = pProjGeom->getDetectorWidth();
 	const float *pfAngles = pProjGeom->getProjectionAngles();
 
-	pProjs = new SFanProjection[nth];
-
-	float fSrcX0 = 0.0f;
-	float fSrcY0 = -fOriginSourceDistance;
-	float fDetUX0 = fDetSize;
-	float fDetUY0 = 0.0f;
-	float fDetSX0 = pProjGeom->getDetectorCount() * fDetUX0 / -2.0f;
-	float fDetSY0 = fOriginDetectorDistance;
-
-#define ROTATE0(name,i,alpha) do { pProjs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); pProjs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0)
-	for (int i = 0; i < nth; ++i) {
-		ROTATE0(Src, i, pfAngles[i]);
-		ROTATE0(DetS, i, pfAngles[i]);
-		ROTATE0(DetU, i, pfAngles[i]);
-	}
-
-#undef ROTATE0
+	pProjs = genFanProjections(nth, pProjGeom->getDetectorCount(),
+                               fOriginSourceDistance, fOriginDetectorDistance,
+	                           fDetSize, pfAngles);
 
 	convertAstraGeometry_internal(pVolGeom, nth, pProjs, outputScale);
 
@@ -964,6 +419,7 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
 	assert(pProjGeom);
 	assert(pProjGeom->getProjectionVectors());
 
+	// TODO: Make EPS relative
 	const float EPS = 0.00001f;
 
 	int nx = pVolGeom->getGridColCount();
@@ -985,6 +441,51 @@ bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
 	return true;
 }
 
+bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
+                          const CProjectionGeometry2D* pProjGeom,
+                          astraCUDA::SParProjection*& pParProjs,
+                          astraCUDA::SFanProjection*& pFanProjs,
+                          float& outputScale)
+{
+	const CParallelProjectionGeometry2D* parProjGeom = dynamic_cast<const CParallelProjectionGeometry2D*>(pProjGeom);
+	const CParallelVecProjectionGeometry2D* parVecProjGeom = dynamic_cast<const CParallelVecProjectionGeometry2D*>(pProjGeom);
+	const CFanFlatProjectionGeometry2D* fanProjGeom = dynamic_cast<const CFanFlatProjectionGeometry2D*>(pProjGeom);
+	const CFanFlatVecProjectionGeometry2D* fanVecProjGeom = dynamic_cast<const CFanFlatVecProjectionGeometry2D*>(pProjGeom);
+
+	bool ok = false;
+
+	if (parProjGeom) {
+		ok = convertAstraGeometry(pVolGeom, parProjGeom, pParProjs, outputScale);
+	} else if (parVecProjGeom) {
+		ok = convertAstraGeometry(pVolGeom, parVecProjGeom, pParProjs, outputScale);
+	} else if (fanProjGeom) {
+		ok = convertAstraGeometry(pVolGeom, fanProjGeom, pFanProjs, outputScale);
+	} else if (fanVecProjGeom) {
+		ok = convertAstraGeometry(pVolGeom, fanVecProjGeom, pFanProjs, outputScale);
+	} else {
+		ok = false;
+	}
+
+	return ok;
+}
+
+bool convertAstraGeometry_dims(const CVolumeGeometry2D* pVolGeom,
+                               const CProjectionGeometry2D* pProjGeom,
+                               SDimensions& dims)
+{
+	dims.iVolWidth = pVolGeom->getGridColCount();
+	dims.iVolHeight = pVolGeom->getGridRowCount();
+
+	dims.iProjAngles = pProjGeom->getProjectionAngleCount();
+	dims.iProjDets = pProjGeom->getDetectorCount();
+
+	dims.iRaysPerDet = 1;
+	dims.iRaysPerPixelDim = 1;
+
+	return true;
+}
+
+
 
 }
 
diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h
index e4cefac..9355cb4 100644
--- a/cuda/2d/astra.h
+++ b/cuda/2d/astra.h
@@ -41,140 +41,12 @@ enum Cuda2DProjectionKernel {
 };
 
 class CParallelProjectionGeometry2D;
+class CParallelVecProjectionGeometry2D;
 class CFanFlatProjectionGeometry2D;
 class CFanFlatVecProjectionGeometry2D;
 class CVolumeGeometry2D;
+class CProjectionGeometry2D;
 
-class AstraFBP_internal;
-
-class _AstraExport AstraFBP {
-public:
-	// Constructor
-	AstraFBP();
-
-	// Destructor
-	~AstraFBP();
-
-	// Set the size of the reconstruction rectangle.
-	// Volume pixels are currently assumed to be 1x1 squares.
-	bool setReconstructionGeometry(unsigned int iVolWidth,
-	                               unsigned int iVolHeight,
-	                               float fPixelSize = 1.0f);
-
-	// Set the projection angles and number of detector pixels per angle.
-	// pfAngles must be a float array of length iProjAngles.
-	// fDetSize indicates the size of a detector pixel compared to a
-	// volume pixel edge.
-	//
-	// pfAngles will only be read from during this call.
-	bool setProjectionGeometry(unsigned int iProjAngles,
-	                           unsigned int iProjDets,
-	                           const float *pfAngles,
-	                           float fDetSize = 1.0f);
-	// Set the projection angles and number of detector pixels per angle.
-	// pfAngles must be a float array of length iProjAngles.
-	// fDetSize indicates the size of a detector pixel compared to a
-	// volume pixel edge.
-	//
-	// pfAngles, fanProjs will only be read from during this call.
-	bool setFanGeometry(unsigned int iProjAngles,
-	                    unsigned int iProjDets,
-	                    const astraCUDA::SFanProjection *fanProjs,
-	                    const float *pfAngles,
-	                    float fOriginSourceDistance,
-	                    float fOriginDetectorDistance,
-	                    float fDetSize = 1.0f,
-	                    bool bShortScan = false);
-
-	// Set linear supersampling factor for the BP.
-	// (The number of rays is the square of this)
-	//
-	// This may optionally be called before init().
-	bool setPixelSuperSampling(unsigned int iPixelSuperSampling);
-
-	// Set per-detector shifts.
-	//
-	// pfTOffsets will only be read from during this call.
-	bool setTOffsets(const float *pfTOffsets);
-
-	// Returns the required size of a filter in the fourier domain
-	// when multiplying it with the fft of the projection data.
-	// Its value is equal to the smallest power of two larger than
-	// or equal to twice the number of detectors in the spatial domain.
-	//
-	// _iDetectorCount is the number of detectors in the spatial domain.
-	static int calcFourierFilterSize(int _iDetectorCount);
-
-	// Sets the filter type. Some filter types require the user to supply an
-	// array containing the filter.
-	// The number of elements in a filter in the fourier domain should be equal
-	// to the value returned by calcFourierFilterSize().
-	// The following types require a filter:
-	//
-	// - FILTER_PROJECTION:
-	// The filter size should be equal to the output of
-	// calcFourierFilterSize(). The filtered sinogram is
-	// multiplied with the supplied filter.
-	//
-	// - FILTER_SINOGRAM:
-	// Same as FILTER_PROJECTION, but now the filter should contain a row for
-	// every projection direction.
-	//
-	// - FILTER_RPROJECTION:
-	// The filter should now contain one kernel (= ifft of filter), with the
-	// peak in the center. The filter width
-	// can be any value. If odd, the peak is assumed to be in the center, if
-	// even, it is assumed to be at floor(filter-width/2).
-	//
-	// - FILTER_RSINOGRAM
-	// Same as FILTER_RPROJECTION, but now the supplied filter should contain a
-	// row for every projection direction.
-	//
-	// A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN,
-	// FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN)
-	// have a D variable, which gives the cutoff point in the frequency domain.
-	// Setting this value to 1.0 will include the whole filter
-	bool setFilter(E_FBPFILTER _eFilter,
-                   const float * _pfHostFilter = NULL,
-                   int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f);
-
-	// Initialize CUDA, allocate GPU buffers and
-	// precompute geometry-specific data.
-	//
-	// CUDA is set up to use GPU number iGPUIndex.
-	//
-	// This must be called after calling setReconstructionGeometry() and
-	// setProjectionGeometry().
-	bool init(int iGPUIndex = 0);
-
-	// Setup input sinogram for a slice.
-	// pfSinogram must be a float array of size iProjAngles*iSinogramPitch.
-	// NB: iSinogramPitch is measured in floats, not in bytes.
-	//
-	// This must be called after init(), and before iterate(). It may be
-	// called again after iterate()/getReconstruction() to start a new slice.
-	//
-	// pfSinogram will only be read from during this call.
-	bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
-
-	// Runs an FBP reconstruction.
-	// This must be called after setSinogram().
-	//
-	// run can be called before setFilter, but will then use the default Ram-Lak filter
-	bool run();
-
-	// Get the reconstructed slice.
-	// pfReconstruction must be a float array of size
-	// iVolHeight*iReconstructionPitch.
-	// NB: iReconstructionPitch is measured in floats, not in bytes.
-	//
-	// This may be called after run().
-	bool getReconstruction(float* pfReconstruction,
-	                       unsigned int iReconstructionPitch) const;
-
-private:
-	AstraFBP_internal* pData;
-};
 
 class _AstraExport BPalgo : public astraCUDA::ReconAlgo {
 public:
@@ -197,8 +69,8 @@ public:
 _AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram,
                  unsigned int iVolWidth, unsigned int iVolHeight,
                  unsigned int iProjAngles, unsigned int iProjDets,
-                 const float *pfAngles, const float *pfOffsets,
-                 float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1,
+                 const SParProjection *pAngles,
+                 unsigned int iDetSuperSampling = 1,
                  float fOutputScale = 1.0f, int iGPUIndex = 0);
 
 _AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
@@ -211,8 +83,14 @@ _AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
 
 _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
                     const CParallelProjectionGeometry2D* pProjGeom,
-                    float*& pfDetectorOffsets, float*& pfProjectionAngles,
-                    float& fDetSize, float& fOutputScale);
+                    astraCUDA::SParProjection*& pProjs,
+                    float& fOutputScale);
+
+_AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
+                    const CParallelVecProjectionGeometry2D* pProjGeom,
+                    astraCUDA::SParProjection*& pProjs,
+                    float& fOutputScale);
+
 
 _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
                     const CFanFlatProjectionGeometry2D* pProjGeom,
@@ -224,6 +102,15 @@ _AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
                     astraCUDA::SFanProjection*& pProjs,
                     float& outputScale);
 
+_AstraExport bool convertAstraGeometry_dims(const CVolumeGeometry2D* pVolGeom,
+                          const CProjectionGeometry2D* pProjGeom,
+                          astraCUDA::SDimensions& dims);
+
+_AstraExport bool convertAstraGeometry(const CVolumeGeometry2D* pVolGeom,
+                          const CProjectionGeometry2D* pProjGeom,
+                          astraCUDA::SParProjection*& pParProjs,
+                          astraCUDA::SFanProjection*& pFanProjs,
+                          float& outputScale);
 }
 
 namespace astraCUDA {
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index 6962d81..1a31438 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -206,42 +206,6 @@ float CGLS::computeDiffNorm()
 	return sqrt(s);
 }
 
-bool doCGLS(float* D_volumeData, unsigned int volumePitch,
-            float* D_sinoData, unsigned int sinoPitch,
-            const SDimensions& dims, /*const SAugmentedData& augs,*/
-            const float* angles, const float* TOffsets, unsigned int iterations)
-{
-	CGLS cgls;
-	bool ok = true;
-
-	ok &= cgls.setGeometry(dims, angles);
-#if 0
-	if (D_maskData)
-		ok &= cgls.enableVolumeMask();
-#endif
-	if (TOffsets)
-		ok &= cgls.setTOffsets(TOffsets);
-
-	if (!ok)
-		return false;
-
-	ok = cgls.init();
-	if (!ok)
-		return false;
-
-#if 0
-	if (D_maskData)
-		ok &= cgls.setVolumeMask(D_maskData, maskPitch);
-#endif
-
-	ok &= cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-	if (!ok)
-		return false;
-
-	ok = cgls.iterate(iterations);
-
-	return ok;
-}
 
 }
 
diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h
index afb997a..10eaabf 100644
--- a/cuda/2d/dims.h
+++ b/cuda/2d/dims.h
@@ -33,9 +33,11 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 
 namespace astraCUDA {
 
+using astra::SParProjection;
 using astra::SFanProjection;
 
 
+
 struct SDimensions {
 	// Width, height of reconstruction volume
 	unsigned int iVolWidth;
@@ -47,9 +49,6 @@ struct SDimensions {
 	// Number of detector pixels
 	unsigned int iProjDets;
 
-	// size of detector compared to volume pixels
-	float fDetScale;
-
 	// in FP, number of rays to trace per detector pixel.
 	// This should usually be set to 1.
 	// If fDetScale > 1, this should be set to an integer of roughly
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index 11ed45b..03cc652 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -169,34 +169,6 @@ float EM::computeDiffNorm()
 }
 
 
-bool doEM(float* D_volumeData, unsigned int volumePitch,
-          float* D_sinoData, unsigned int sinoPitch,
-          const SDimensions& dims, const float* angles,
-          const float* TOffsets, unsigned int iterations)
-{
-	EM em;
-	bool ok = true;
-
-	ok &= em.setGeometry(dims, angles);
-	if (TOffsets)
-		ok &= em.setTOffsets(TOffsets);
-
-	if (!ok)
-		return false;
-
-	ok = em.init();
-	if (!ok)
-		return false;
-
-	ok &= em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-	if (!ok)
-		return false;
-
-	ok = em.iterate(iterations);
-
-	return ok;
-}
-
 }
 
 #ifdef STANDALONE
diff --git a/cuda/2d/fbp.cu b/cuda/2d/fbp.cu
new file mode 100644
index 0000000..365fc88
--- /dev/null
+++ b/cuda/2d/fbp.cu
@@ -0,0 +1,347 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp
+           2014-2015, CWI, Amsterdam
+
+Contact: astra@uantwerpen.be
+Website: http://sf.net/projects/astra-toolbox
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "fbp.h"
+#include "fft.h"
+#include "par_bp.h"
+#include "fan_bp.h"
+#include "util.h"
+
+// For fan-beam preweighting
+#include "../3d/fdk.h"
+
+#include "astra/Logging.h"
+
+#include <cuda.h>
+
+namespace astraCUDA {
+
+
+
+static int calcNextPowerOfTwo(int n)
+{
+	int x = 1;
+	while (x < n)
+		x *= 2;
+
+	return x;
+}
+
+// static
+int FBP::calcFourierFilterSize(int _iDetectorCount)
+{
+	int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount);
+	int iFreqBinCount = calcFFTFourierSize(iFFTRealDetCount);
+
+	// CHECKME: Matlab makes this at least 64. Do we also need to?
+	return iFreqBinCount;
+}
+
+
+
+
+FBP::FBP() : ReconAlgo()
+{
+	D_filter = 0;
+
+}
+
+FBP::~FBP()
+{
+	reset();
+}
+
+void FBP::reset()
+{
+	if (D_filter) {
+		freeComplexOnDevice((cufftComplex *)D_filter);
+		D_filter = 0;
+	}
+}
+
+bool FBP::init()
+{
+	return true;
+}
+
+bool FBP::setFilter(astra::E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */)
+{
+	if (D_filter)
+	{
+		freeComplexOnDevice((cufftComplex*)D_filter);
+		D_filter = 0;
+	}
+
+	if (_eFilter == astra::FILTER_NONE)
+		return true; // leave D_filter set to 0
+
+
+	int iFFTRealDetCount = calcNextPowerOfTwo(2 * dims.iProjDets);
+	int iFreqBinCount = calcFFTFourierSize(iFFTRealDetCount);
+
+	cufftComplex * pHostFilter = new cufftComplex[dims.iProjAngles * iFreqBinCount];
+	memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iFreqBinCount);
+
+	allocateComplexOnDevice(dims.iProjAngles, iFreqBinCount, (cufftComplex**)&D_filter);
+
+	switch(_eFilter)
+	{
+		case astra::FILTER_NONE:
+			// handled above
+			break;
+		case astra::FILTER_RAMLAK:
+		case astra::FILTER_SHEPPLOGAN:
+		case astra::FILTER_COSINE:
+		case astra::FILTER_HAMMING:
+		case astra::FILTER_HANN:
+		case astra::FILTER_TUKEY:
+		case astra::FILTER_LANCZOS:
+		case astra::FILTER_TRIANGULAR:
+		case astra::FILTER_GAUSSIAN:
+		case astra::FILTER_BARTLETTHANN:
+		case astra::FILTER_BLACKMAN:
+		case astra::FILTER_NUTTALL:
+		case astra::FILTER_BLACKMANHARRIS:
+		case astra::FILTER_BLACKMANNUTTALL:
+		case astra::FILTER_FLATTOP:
+		case astra::FILTER_PARZEN:
+		{
+			genFilter(_eFilter, _fD, dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter);
+			uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter);
+
+			break;
+		}
+		case astra::FILTER_PROJECTION:
+		{
+			// make sure the offered filter has the correct size
+			assert(_iFilterWidth == iFreqBinCount);
+
+			for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
+			{
+				float fValue = _pfHostFilter[iFreqBinIndex];
+
+				for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++)
+				{
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
+				}
+			}
+			uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter);
+			break;
+		}
+		case astra::FILTER_SINOGRAM:
+		{
+			// make sure the offered filter has the correct size
+			assert(_iFilterWidth == iFreqBinCount);
+
+			for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
+			{
+				for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++)
+				{
+					float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth];
+
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
+				}
+			}
+			uploadComplexArrayToDevice(dims.iProjAngles, iFreqBinCount, pHostFilter, (cufftComplex*)D_filter);
+			break;
+		}
+		case astra::FILTER_RPROJECTION:
+		{
+			int iProjectionCount = dims.iProjAngles;
+			int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
+			float * pfHostRealFilter = new float[iRealFilterElementCount];
+			memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
+
+			int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
+			int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
+			int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
+
+			int iFilterShiftSize = _iFilterWidth / 2;
+
+			for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
+			{
+				int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
+				float fValue = _pfHostFilter[iDetectorIndex];
+
+				for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++)
+				{
+					pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
+				}
+			}
+
+			float* pfDevRealFilter = NULL;
+			cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
+			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
+			delete[] pfHostRealFilter;
+
+			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, (cufftComplex*)D_filter);
+
+			cudaFree(pfDevRealFilter);
+
+			break;
+		}
+		case astra::FILTER_RSINOGRAM:
+		{
+			int iProjectionCount = dims.iProjAngles;
+			int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
+			float* pfHostRealFilter = new float[iRealFilterElementCount];
+			memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
+
+			int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
+			int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
+			int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
+
+			int iFilterShiftSize = _iFilterWidth / 2;
+
+			for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
+			{
+				int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
+
+				for(int iProjectionIndex = 0; iProjectionIndex < (int)dims.iProjAngles; iProjectionIndex++)
+				{
+					float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth];
+					pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
+				}
+			}
+
+			float* pfDevRealFilter = NULL;
+			cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors
+			cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
+			delete[] pfHostRealFilter;
+
+			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, (cufftComplex*)D_filter);
+
+			cudaFree(pfDevRealFilter);
+
+			break;
+		}
+		default:
+		{
+			ASTRA_ERROR("FBP::setFilter: Unknown filter type requested");
+			delete [] pHostFilter;
+			return false;
+		}
+	}
+
+	delete [] pHostFilter;
+
+	return true;
+}
+
+bool FBP::iterate(unsigned int iterations)
+{
+	zeroVolumeData(D_volumeData, volumePitch, dims);
+
+	bool ok = false;
+
+	if (fanProjs) {
+		// Call FDK_PreWeight to handle fan beam geometry. We treat
+		// this as a cone beam setup of a single slice:
+
+		// TODO: TOffsets affects this preweighting...
+
+		// TODO: We take the fan parameters from the last projection here
+		// without checking if they're the same in all projections
+
+		float *pfAngles = new float[dims.iProjAngles];
+
+		float fOriginSource, fOriginDetector, fDetSize, fOffset;
+		for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+			bool ok = astra::getFanParameters(fanProjs[i], dims.iProjDets,
+			                                  pfAngles[i],
+			                                  fOriginSource, fOriginDetector,
+			                                  fDetSize, fOffset);
+			if (!ok) {
+				ASTRA_ERROR("FBP_CUDA: Failed to extract circular fan beam parameters from fan beam geometry");
+				return false;
+			}
+		}
+
+		// We create a fake cudaPitchedPtr
+		cudaPitchedPtr tmp;
+		tmp.ptr = D_sinoData;
+		tmp.pitch = sinoPitch * sizeof(float);
+		tmp.xsize = dims.iProjDets;
+		tmp.ysize = dims.iProjAngles;
+		// and a fake Dimensions3D
+		astraCUDA3d::SDimensions3D dims3d;
+		dims3d.iVolX = dims.iVolWidth;
+		dims3d.iVolY = dims.iVolHeight;
+		dims3d.iVolZ = 1;
+		dims3d.iProjAngles = dims.iProjAngles;
+		dims3d.iProjU = dims.iProjDets;
+		dims3d.iProjV = 1;
+
+		astraCUDA3d::FDK_PreWeight(tmp, fOriginSource,
+		              fOriginDetector, 0.0f,
+		              fDetSize, 1.0f,
+		              m_bShortScan, dims3d, pfAngles);
+	} else {
+		// TODO: How should different detector pixel size in different
+		// projections be handled?
+	}
+
+	if (D_filter) {
+
+		int iFFTRealDetCount = calcNextPowerOfTwo(2 * dims.iProjDets);
+		int iFFTFourDetCount = calcFFTFourierSize(iFFTRealDetCount);
+
+		cufftComplex * pDevComplexSinogram = NULL;
+
+		allocateComplexOnDevice(dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram);
+
+		runCudaFFT(dims.iProjAngles, D_sinoData, sinoPitch, dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram);
+
+		applyFilter(dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, (cufftComplex*)D_filter);
+
+		runCudaIFFT(dims.iProjAngles, pDevComplexSinogram, D_sinoData, sinoPitch, dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount);
+
+		freeComplexOnDevice(pDevComplexSinogram);
+
+	}
+
+	float fOutputScale = (M_PI / 2.0f) / (float)dims.iProjAngles;
+
+	if (fanProjs) {
+		ok = FanBP_FBPWeighted(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, fanProjs, fOutputScale);
+
+	} else {
+		ok = BP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, parProjs, fOutputScale);
+	}
+	if(!ok)
+	{
+		return false;
+	}
+
+	return true;
+}
+
+
+}
diff --git a/cuda/2d/fbp.h b/cuda/2d/fbp.h
new file mode 100644
index 0000000..8b4d64d
--- /dev/null
+++ b/cuda/2d/fbp.h
@@ -0,0 +1,98 @@
+/*
+-----------------------------------------------------------------------
+Copyright: 2010-2015, iMinds-Vision Lab, University of Antwerp
+           2014-2015, CWI, Amsterdam
+
+Contact: astra@uantwerpen.be
+Website: http://sf.net/projects/astra-toolbox
+
+This file is part of the ASTRA Toolbox.
+
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "algo.h"
+#include "fbp_filters.h"
+
+namespace astraCUDA {
+
+class _AstraExport FBP : public ReconAlgo {
+public:
+	FBP();
+	~FBP();
+
+	virtual bool useSinogramMask() { return false; }
+	virtual bool useVolumeMask() { return false; }
+
+	// Returns the required size of a filter in the fourier domain
+	// when multiplying it with the fft of the projection data.
+	// Its value is equal to the smallest power of two larger than
+	// or equal to twice the number of detectors in the spatial domain.
+	//
+	// _iDetectorCount is the number of detectors in the spatial domain.
+	static int calcFourierFilterSize(int _iDetectorCount);
+
+	// Sets the filter type. Some filter types require the user to supply an
+	// array containing the filter.
+	// The number of elements in a filter in the fourier domain should be equal
+	// to the value returned by calcFourierFilterSize().
+	// The following types require a filter:
+	//
+	// - FILTER_PROJECTION:
+	// The filter size should be equal to the output of
+	// calcFourierFilterSize(). The filtered sinogram is
+	// multiplied with the supplied filter.
+	//
+	// - FILTER_SINOGRAM:
+	// Same as FILTER_PROJECTION, but now the filter should contain a row for
+	// every projection direction.
+	//
+	// - FILTER_RPROJECTION:
+	// The filter should now contain one kernel (= ifft of filter), with the
+	// peak in the center. The filter width
+	// can be any value. If odd, the peak is assumed to be in the center, if
+	// even, it is assumed to be at floor(filter-width/2).
+	//
+	// - FILTER_RSINOGRAM
+	// Same as FILTER_RPROJECTION, but now the supplied filter should contain a
+	// row for every projection direction.
+	//
+	// A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN,
+	// FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN)
+	// have a D variable, which gives the cutoff point in the frequency domain.
+	// Setting this value to 1.0 will include the whole filter
+	bool setFilter(astra::E_FBPFILTER _eFilter,
+                   const float * _pfHostFilter = NULL,
+                   int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f);
+
+	bool setShortScan(bool ss) { m_bShortScan = ss; return true; }
+
+	virtual bool init();
+
+	virtual bool iterate(unsigned int iterations);
+
+	virtual float computeDiffNorm() { return 0.0f; } // TODO
+
+protected:
+	void reset();
+
+	void* D_filter; // cufftComplex*
+	bool m_bShortScan;
+};
+
+}
diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h
index f1223c5..bf1342d 100644
--- a/cuda/2d/fbp_filters.h
+++ b/cuda/2d/fbp_filters.h
@@ -28,6 +28,8 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #ifndef FBP_FILTERS_H
 #define FBP_FILTERS_H
 
+namespace astra {
+
 enum E_FBPFILTER
 {
 	FILTER_NONE,			//< no filter (regular BP)
@@ -54,4 +56,6 @@ enum E_FBPFILTER
 	FILTER_RSINOGRAM,		//< sinogram filter in real space
 };
 
+}
+
 #endif /* FBP_FILTERS_H */
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 654bbae..5500e14 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -63,6 +63,8 @@ using namespace astra;
   } } while (0)
 
 
+namespace astraCUDA {
+
 __global__ static void applyFilter_kernel(int _iProjectionCount,
                                           int _iFreqBinCount,
                                           cufftComplex * _pSinogram,
@@ -275,11 +277,11 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 // Because the input is real, the Fourier transform is symmetric.
 // CUFFT only outputs the first half (ignoring the redundant second half),
 // and expects the same as input for the IFFT.
-int calcFFTFourSize(int _iFFTRealSize)
+int calcFFTFourierSize(int _iFFTRealSize)
 {
-	int iFFTFourSize = _iFFTRealSize / 2 + 1;
+	int iFFTFourierSize = _iFFTRealSize / 2 + 1;
 
-	return iFFTFourSize;
+	return iFFTFourierSize;
 }
 
 void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
@@ -694,6 +696,10 @@ void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
 	delete[] pfW;
 }
 
+
+}
+
+
 #ifdef STANDALONE
 
 __global__ static void doubleFourierOutput_kernel(int _iProjectionCount,
diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h
index 91f05f2..3e91043 100644
--- a/cuda/2d/fft.h
+++ b/cuda/2d/fft.h
@@ -33,6 +33,8 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 
 #include "fbp_filters.h"
 
+namespace astraCUDA {
+
 bool allocateComplexOnDevice(int _iProjectionCount,
                              int _iDetectorCount,
                              cufftComplex ** _ppDevComplex);
@@ -56,13 +58,15 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 void applyFilter(int _iProjectionCount, int _iFreqBinCount,
                  cufftComplex * _pSinogram, cufftComplex * _pFilter);
 
-int calcFFTFourSize(int _iFFTRealSize);
+int calcFFTFourierSize(int _iFFTRealSize);
 
-void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
+void genFilter(astra::E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
                cufftComplex * _pFilter, int _iFFTRealDetectorCount,
                int _iFFTFourierDetectorCount, float _fParameter = -1.0f);
 
 void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
                    int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
 
+}
+
 #endif /* FFT_H */
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index dfdd193..307c561 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -52,8 +52,8 @@ const unsigned int g_blockSlices = 16;
 
 const unsigned int g_MaxAngles = 2560;
 
-__constant__ float gC_angle_sin[g_MaxAngles];
-__constant__ float gC_angle_cos[g_MaxAngles];
+__constant__ float gC_angle_scaled_sin[g_MaxAngles];
+__constant__ float gC_angle_scaled_cos[g_MaxAngles];
 __constant__ float gC_angle_offset[g_MaxAngles];
 
 static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height, cudaTextureAddressMode mode = cudaAddressModeBorder)
@@ -72,7 +72,7 @@ static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int wi
 	return true;
 }
 
-__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale)
+__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -86,47 +86,30 @@ __global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int star
 	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
 		return;
 
-	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
-	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f );
 
 	float* volData = (float*)D_volData;
 
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
-	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
 
-	if (offsets) {
-
-		for (int angle = startAngle; angle < endAngle; ++angle)
-		{
-			const float cos_theta = gC_angle_cos[angle];
-			const float sin_theta = gC_angle_sin[angle];
-			const float TOffset = gC_angle_offset[angle];
-
-			const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
-			fVal += tex2D(gT_projTexture, fT, fA);
-			fA += 1.0f;
-		}
-
-	} else {
-
-		for (int angle = startAngle; angle < endAngle; ++angle)
-		{
-			const float cos_theta = gC_angle_cos[angle];
-			const float sin_theta = gC_angle_sin[angle];
-
-			const float fT = fT_base + fX * cos_theta - fY * sin_theta;
-			fVal += tex2D(gT_projTexture, fT, fA);
-			fA += 1.0f;
-		}
+	for (int angle = startAngle; angle < endAngle; ++angle)
+	{
+		const float scaled_cos_theta = gC_angle_scaled_cos[angle];
+		const float scaled_sin_theta = gC_angle_scaled_sin[angle];
+		const float TOffset = gC_angle_offset[angle];
 
+		const float fT = fX * scaled_cos_theta - fY * scaled_sin_theta + TOffset;
+		fVal += tex2D(gT_projTexture, fT, fA);
+		fA += 1.0f;
 	}
 
 	volData[Y*volPitch+X] += fVal * fOutputScale;
 }
 
 // supersampling version
-__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims, float fOutputScale)
+__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims, float fOutputScale)
 {
 	const int relX = threadIdx.x;
 	const int relY = threadIdx.y;
@@ -140,61 +123,35 @@ __global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int s
 	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
 		return;
 
-	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
-	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
 
-	const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale);
+	const float fSubStep = 1.0f/(dims.iRaysPerPixelDim); // * dims.fDetScale);
 
 	float* volData = (float*)D_volData;
 
 	float fVal = 0.0f;
 	float fA = startAngle + 0.5f;
-	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
 
 	fOutputScale /= (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
 
-	if (offsets) {
-
-		for (int angle = startAngle; angle < endAngle; ++angle)
-		{
-			const float cos_theta = gC_angle_cos[angle];
-			const float sin_theta = gC_angle_sin[angle];
-			const float TOffset = gC_angle_offset[angle];
-
-			float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
-
-			for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
-				float fTy = fT;
-				fT += fSubStep * cos_theta;
-				for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
-					fVal += tex2D(gT_projTexture, fTy, fA);
-					fTy -= fSubStep * sin_theta;
-				}
-			}
-			fA += 1.0f;
-		}
-
-	} else {
+	for (int angle = startAngle; angle < endAngle; ++angle)
+	{
+		const float cos_theta = gC_angle_scaled_cos[angle];
+		const float sin_theta = gC_angle_scaled_sin[angle];
+		const float TOffset = gC_angle_offset[angle];
 
-		for (int angle = startAngle; angle < endAngle; ++angle)
-		{
-			const float cos_theta = gC_angle_cos[angle];
-			const float sin_theta = gC_angle_sin[angle];
+		float fT = fX * cos_theta - fY * sin_theta + TOffset;
 
-			float fT = fT_base + fX * cos_theta - fY * sin_theta;
-
-			for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
-				float fTy = fT;
-				fT += fSubStep * cos_theta;
-				for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
-					fVal += tex2D(gT_projTexture, fTy, fA);
-					fTy -= fSubStep * sin_theta;
-				}
+		for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+			float fTy = fT;
+			fT += fSubStep * cos_theta;
+			for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+				fVal += tex2D(gT_projTexture, fTy, fA);
+				fTy -= fSubStep * sin_theta;
 			}
-			fA += 1.0f;
-
 		}
-
+		fA += 1.0f;
 	}
 
 	volData[Y*volPitch+X] += fVal * fOutputScale;
@@ -211,12 +168,10 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
 	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
 		return;
 
-	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
-	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
-
-	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f );
 
-	const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
+	const float fT = fX * angle_cos - fY * angle_sin + offset;
 	const float fVal = tex2D(gT_projTexture, fT, 0.5f);
 
 	D_volData[Y*volPitch+X] += fVal * fOutputScale;
@@ -225,32 +180,35 @@ __global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset
 
 bool BP_internal(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale)
+        const SDimensions& dims, const SParProjection* angles,
+        float fOutputScale)
 {
-	// TODO: process angles block by block
 	assert(dims.iProjAngles <= g_MaxAngles);
 
-	float* angle_sin = new float[dims.iProjAngles];
-	float* angle_cos = new float[dims.iProjAngles];
+	float* angle_scaled_sin = new float[dims.iProjAngles];
+	float* angle_scaled_cos = new float[dims.iProjAngles];
+	float* angle_offset = new float[dims.iProjAngles];
 
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, dims.iProjAngles);
 
 	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
-		angle_sin[i] = sinf(angles[i]);
-		angle_cos[i] = cosf(angles[i]);
+		double d = angles[i].fDetUX * angles[i].fRayY - angles[i].fDetUY * angles[i].fRayX;
+		angle_scaled_cos[i] = angles[i].fRayY / d;
+		angle_scaled_sin[i] = -angles[i].fRayX / d; // TODO: Check signs
+		angle_offset[i] = (angles[i].fDetSY * angles[i].fRayX - angles[i].fDetSX * angles[i].fRayY) / d;
 	}
-	cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-	cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+	cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_scaled_sin, angle_scaled_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_scaled_cos, angle_scaled_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, angle_offset, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 	assert(e1 == cudaSuccess);
 	assert(e2 == cudaSuccess);
+	assert(e3 == cudaSuccess);
 
-	if (TOffsets) {
-		cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-		assert(e3 == cudaSuccess);
-	}
 
-	delete[] angle_sin;
-	delete[] angle_cos;
+	delete[] angle_scaled_sin;
+	delete[] angle_scaled_cos;
+	delete[] angle_offset;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
@@ -262,9 +220,9 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 
 		if (dims.iRaysPerPixelDim > 1)
-			devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale);
+			devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 		else
-			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims, fOutputScale);
+			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
 	cudaThreadSynchronize();
 
@@ -277,7 +235,7 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 
 bool BP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles, const float* TOffsets, float fOutputScale)
+        const SDimensions& dims, const SParProjection* angles, float fOutputScale)
 {
 	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
 		SDimensions subdims = dims;
@@ -289,9 +247,7 @@ bool BP(float* D_volumeData, unsigned int volumePitch,
 		bool ret;
 		ret = BP_internal(D_volumeData, volumePitch,
 		                  D_projData + iAngle * projPitch, projPitch,
-		                  subdims, angles + iAngle,
-		                  TOffsets ? TOffsets + iAngle : 0,
-		                  fOutputScale);
+		                  subdims, angles + iAngle, fOutputScale);
 		if (!ret)
 			return false;
 	}
@@ -302,25 +258,23 @@ bool BP(float* D_volumeData, unsigned int volumePitch,
 bool BP_SART(float* D_volumeData, unsigned int volumePitch,
              float* D_projData, unsigned int projPitch,
              unsigned int angle, const SDimensions& dims,
-             const float* angles, const float* TOffsets, float fOutputScale)
+             const SParProjection* angles, float fOutputScale)
 {
 	// Only one angle.
 	// We need to Clamp to the border pixels instead of to zero, because
 	// SART weights with ray length.
 	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1, cudaAddressModeClamp);
 
-	float angle_sin = sinf(angles[angle]);
-	float angle_cos = cosf(angles[angle]);
-
-	float offset = 0.0f;
-	if (TOffsets)
-		offset = TOffsets[angle];
+	double d = angles[angle].fDetUX * angles[angle].fRayY - angles[angle].fDetUY * angles[angle].fRayX;
+	float angle_scaled_cos = angles[angle].fRayY / d;
+	float angle_scaled_sin = -angles[angle].fRayX / d; // TODO: Check signs
+	float angle_offset = (angles[angle].fDetSY * angles[angle].fRayX - angles[angle].fDetSX * angles[angle].fRayY) / d;
 
 	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
 	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
-	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims, fOutputScale);
+	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale);
 	cudaThreadSynchronize();
 
 	cudaTextForceKernelsCompletion();
diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h
index 0720030..3fa8a28 100644
--- a/cuda/2d/par_bp.h
+++ b/cuda/2d/par_bp.h
@@ -34,13 +34,13 @@ namespace astraCUDA {
 
 _AstraExport bool BP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles,
-        const float* TOffsets, float fOutputScale);
+        const SDimensions& dims, const SParProjection* angles,
+        float fOutputScale);
 
 _AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch,
              float* D_projData, unsigned int projPitch,
              unsigned int angle, const SDimensions& dims,
-             const float* angles, const float* TOffsets, float fOutputScale);
+             const SParProjection* angles, float fOutputScale);
 
 }
 
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index f58a643..a0b04ee 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -29,6 +29,7 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 #include <cassert>
 #include <iostream>
 #include <list>
+#include <cmath>
 
 #include "util.h"
 #include "arith.h"
@@ -50,6 +51,7 @@ namespace astraCUDA {
 static const unsigned g_MaxAngles = 2560;
 __constant__ float gC_angle[g_MaxAngles];
 __constant__ float gC_angle_offset[g_MaxAngles];
+__constant__ float gC_angle_detsize[g_MaxAngles];
 
 
 // optimization parameters
@@ -62,10 +64,6 @@ static const unsigned int g_blockSlices = 64;
 #define iPREC_FACTOR 16
 
 
-// if necessary, a buffer of zeroes of size g_MaxAngles
-static float* g_pfZeroes = 0;
-
-
 static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
@@ -88,9 +86,10 @@ static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned i
 	return true;
 }
 
+
 // projection for angles that are roughly horizontal
-// theta between 45 and 135 degrees
-__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+// (detector roughly vertical)
+__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
 {
 	const int relDet = threadIdx.x;
 	const int relAngle = threadIdx.y;
@@ -105,33 +104,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
 	const float sin_theta = __sinf(theta);
 
 	// compute start detector for this block/angle:
-	// (The same for all threadIdx.x)
-	// -------------------------------------
-
-	const int midSlice = startSlice + g_blockSlices / 2;
-
-	// ASSUMPTION: fDetScale >= 1.0f
-	// problem: detector regions get skipped because slice blocks aren't large
-	// enough
-	const unsigned int g_blockSliceSize = g_detBlockSize;
-
-	// project (midSlice,midRegion) on this thread's detector
-
-	const float fBase = 0.5f*dims.iProjDets - 0.5f +
-		(
-		    (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
-		  - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
-		  + gC_angle_offset[angle]
-		) / dims.fDetScale;
-	int iBase = (int)floorf(fBase * fPREC_FACTOR);
-	int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR);
-
-	// ASSUMPTION: 16 > regionOffset / fDetScale
-	const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-	const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-
-	if (blockIdx.y > 0 && detRegion == detPrevRegion)
-		return;
+	const int detRegion = blockIdx.y;
 
 	const int detector = detRegion * g_detBlockSize + relDet;
 
@@ -141,7 +114,7 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
 	if (detector < 0 || detector >= dims.iProjDets)
 		return;
 
-	const float fDetStep = -dims.fDetScale / sin_theta;
+	const float fDetStep = -gC_angle_detsize[angle] / sin_theta;
 	float fSliceStep = cos_theta / sin_theta;
 	float fDistCorr;
 	if (sin_theta > 0.0f)
@@ -191,9 +164,10 @@ __global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned
 	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
+
 // projection for angles that are roughly vertical
-// theta between 0 and 45, or 135 and 180 degrees
-__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+// (detector roughly horizontal)
+__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
 {
 	const int relDet = threadIdx.x;
 	const int relAngle = threadIdx.y;
@@ -208,33 +182,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
 	const float sin_theta = __sinf(theta);
 
 	// compute start detector for this block/angle:
-	// (The same for all threadIdx.x)
-	// -------------------------------------
-
-	const int midSlice = startSlice + g_blockSlices / 2;
-
-	// project (midSlice,midRegion) on this thread's detector
-
-	// ASSUMPTION: fDetScale >= 1.0f
-	// problem: detector regions get skipped because slice blocks aren't large
-	// enough
-	const unsigned int g_blockSliceSize = g_detBlockSize;
-
-	const float fBase = 0.5f*dims.iProjDets - 0.5f +
-		(
-		    (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
-		  - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
-		  + gC_angle_offset[angle]
-		) / dims.fDetScale;
-	int iBase = (int)floorf(fBase * fPREC_FACTOR);
-	int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR);
-
-	// ASSUMPTION: 16 > regionOffset / fDetScale
-	const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-	const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
-
-	if (blockIdx.y > 0 && detRegion == detPrevRegion)
-		return;
+	const int detRegion = blockIdx.y;
 
 	const int detector = detRegion * g_detBlockSize + relDet;
 
@@ -244,7 +192,7 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
 	if (detector < 0 || detector >= dims.iProjDets)
 		return;
 
-	const float fDetStep = dims.fDetScale / cos_theta;
+	const float fDetStep = gC_angle_detsize[angle] / cos_theta;
 	float fSliceStep = sin_theta / cos_theta;
 	float fDistCorr;
 	if (cos_theta < 0.0f)
@@ -292,183 +240,45 @@ __global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned i
 	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
 }
 
-// projection for angles that are roughly horizontal
-// (detector roughly vertical)
-__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
-{
-	const int relDet = threadIdx.x;
-	const int relAngle = threadIdx.y;
-
-	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
-
-	if (angle >= endAngle)
-		return;
 
-	const float theta = gC_angle[angle];
-	const float cos_theta = __cosf(theta);
-	const float sin_theta = __sinf(theta);
 
-	// compute start detector for this block/angle:
-	const int detRegion = blockIdx.y;
-
-	const int detector = detRegion * g_detBlockSize + relDet;
-
-	// Now project the part of the ray to angle,detector through
-	// slices startSlice to startSlice+g_blockSlices-1
 
-	if (detector < 0 || detector >= dims.iProjDets)
-		return;
+// Coordinates of center of detector pixel number t:
+// x = (t - 0.5*nDets + 0.5 - fOffset) * fSize * cos(fAngle)
+// y = - (t - 0.5*nDets + 0.5 - fOffset) * fSize * sin(fAngle)
 
-	const float fDetStep = -dims.fDetScale / sin_theta;
-	float fSliceStep = cos_theta / sin_theta;
-	float fDistCorr;
-	if (sin_theta > 0.0f)
-		fDistCorr = -fDetStep;
-	else
-		fDistCorr = fDetStep;
-	fDistCorr *= outputScale;
 
-	float fVal = 0.0f;
-	// project detector on slice
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 0.5f;
-	float fS = startSlice + 0.5f;
-	int endSlice = startSlice + g_blockSlices;
-	if (endSlice > dims.iVolWidth)
-		endSlice = dims.iVolWidth;
-
-	if (dims.iRaysPerDet > 1) {
-
-		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
-		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
-		fDistCorr /= dims.iRaysPerDet;
-
-		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
-
-		for (int slice = startSlice; slice < endSlice; ++slice)
-		{
-			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
-				fVal += tex2D(gT_volumeTexture, fS, fP);
-				fP += fSubDetStep;
-			}
-			fP += fSliceStep;
-			fS += 1.0f;
-		}
-
-	} else {
-
-		for (int slice = startSlice; slice < endSlice; ++slice)
-		{
-			fVal += tex2D(gT_volumeTexture, fS, fP);
-			fP += fSliceStep;
-			fS += 1.0f;
-		}
-
-
-	}
-
-	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
-}
-
-
-// projection for angles that are roughly vertical
-// (detector roughly horizontal)
-__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+static void convertAndUploadAngles(const SParProjection *projs, unsigned int nth, unsigned int ndets)
 {
-	const int relDet = threadIdx.x;
-	const int relAngle = threadIdx.y;
-
-	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
-
-	if (angle >= endAngle)
-		return;
-
-	const float theta = gC_angle[angle];
-	const float cos_theta = __cosf(theta);
-	const float sin_theta = __sinf(theta);
-
-	// compute start detector for this block/angle:
-	const int detRegion = blockIdx.y;
-
-	const int detector = detRegion * g_detBlockSize + relDet;
-
-	// Now project the part of the ray to angle,detector through
-	// slices startSlice to startSlice+g_blockSlices-1
-
-	if (detector < 0 || detector >= dims.iProjDets)
-		return;
-
-	const float fDetStep = dims.fDetScale / cos_theta;
-	float fSliceStep = sin_theta / cos_theta;
-	float fDistCorr;
-	if (cos_theta < 0.0f)
-		fDistCorr = -fDetStep;
-	else
-		fDistCorr = fDetStep;
-	fDistCorr *= outputScale;
-
-	float fVal = 0.0f;
-	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 0.5f;
-	float fS = startSlice+0.5f;
-	int endSlice = startSlice + g_blockSlices;
-	if (endSlice > dims.iVolHeight)
-		endSlice = dims.iVolHeight;
-
-	if (dims.iRaysPerDet > 1) {
-
-		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
-		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
-		fDistCorr /= dims.iRaysPerDet;
-
-		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
-
-		for (int slice = startSlice; slice < endSlice; ++slice)
-		{
-			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
-				fVal += tex2D(gT_volumeTexture, fP, fS);
-				fP += fSubDetStep;
-			}
-			fP += fSliceStep;
-			fS += 1.0f;
-		}
-
-	} else {
+	float *angles = new float[nth];
+	float *offsets = new float[nth];
+	float *detsizes = new float[nth];
 
-		for (int slice = startSlice; slice < endSlice; ++slice)
-		{
-			fVal += tex2D(gT_volumeTexture, fP, fS);
-			fP += fSliceStep;
-			fS += 1.0f;
-		}
-
-	}
+	for (int i = 0; i < nth; ++i)
+		getParParameters(projs[i], ndets, angles[i], detsizes[i], offsets[i]);
 
-	D_projData[angle*projPitch+detector] += fVal * fDistCorr;
+	cudaMemcpyToSymbol(gC_angle, angles, nth*sizeof(float), 0, cudaMemcpyHostToDevice); 
+	cudaMemcpyToSymbol(gC_angle_offset, offsets, nth*sizeof(float), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(gC_angle_detsize, detsizes, nth*sizeof(float), 0, cudaMemcpyHostToDevice); 
 }
 
 
 
 bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
                float* D_projData, unsigned int projPitch,
-               const SDimensions& dims, const float* angles,
-               const float* TOffsets, float outputScale)
+               const SDimensions& dims, const SParProjection* angles,
+               float outputScale)
 {
-	// TODO: load angles into constant memory in smaller blocks
 	assert(dims.iProjAngles <= g_MaxAngles);
 
+	assert(angles);
+
 	cudaArray* D_dataArray;
 	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
 
-	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
 
-	if (TOffsets) {
-		cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-	} else {
-		if (!g_pfZeroes) {
-			g_pfZeroes = new float[g_MaxAngles];
-			memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
-		}
-		cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-	}
+	convertAndUploadAngles(angles, dims.iProjAngles, dims.iProjDets);
+
 
 	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles
 
@@ -491,7 +301,7 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 		// Maybe we should detect corner cases and put them in the optimal
 		// group of angles.
 		if (a != dims.iProjAngles)
-			vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+			vertical = (fabsf(angles[a].fRayX) <= fabsf(angles[a].fRayY));
 		if (a == dims.iProjAngles || vertical != blockVertical) {
 			// block done
 
@@ -535,8 +345,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 
 bool FP_simple(float* D_volumeData, unsigned int volumePitch,
                float* D_projData, unsigned int projPitch,
-               const SDimensions& dims, const float* angles,
-               const float* TOffsets, float outputScale)
+               const SDimensions& dims, const SParProjection* angles,
+               float outputScale)
 {
 	for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) {
 		SDimensions subdims = dims;
@@ -549,7 +359,7 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch,
 		ret = FP_simple_internal(D_volumeData, volumePitch,
 		                         D_projData + iAngle * projPitch, projPitch,
 		                         subdims, angles + iAngle,
-		                         TOffsets ? TOffsets + iAngle : 0, outputScale);
+		                         outputScale);
 		if (!ret)
 			return false;
 	}
@@ -558,106 +368,12 @@ bool FP_simple(float* D_volumeData, unsigned int volumePitch,
 
 bool FP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles,
-        const float* TOffsets, float outputScale)
+        const SDimensions& dims, const SParProjection* angles,
+        float outputScale)
 {
 	return FP_simple(D_volumeData, volumePitch, D_projData, projPitch,
-	                 dims, angles, TOffsets, outputScale);
+	                 dims, angles, outputScale);
 
-	// TODO: Fix bug in this non-simple FP with large detscale and TOffsets
-#if 0
-
-	// TODO: load angles into constant memory in smaller blocks
-	assert(dims.iProjAngles <= g_MaxAngles);
-
-	// TODO: compute region size dynamically to resolve these two assumptions
-	// ASSUMPTION: 16 > regionOffset / fDetScale
-	const unsigned int g_blockSliceSize = g_detBlockSize;
-	assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale);
-	// ASSUMPTION: fDetScale >= 1.0f
-	assert(dims.fDetScale > 0.9999f);
-
-	cudaArray* D_dataArray;
-	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth, dims.iVolHeight);
-
-	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-
-	if (TOffsets) {
-		cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-	} else {
-		if (!g_pfZeroes) {
-			g_pfZeroes = new float[g_MaxAngles];
-			memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
-		}
-		cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
-	}
-
-	int regionOffset = g_blockSlices / g_blockSliceSize;
-
-	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
-
-	std::list<cudaStream_t> streams;
-
-
-	// Run over all angles, grouping them into groups of the same
-	// orientation (roughly horizontal vs. roughly vertical).
-	// Start a stream of grids for each such group.
-
-	// TODO: Check if it's worth it to store this info instead
-	// of recomputing it every FP.
-
-	unsigned int blockStart = 0;
-	unsigned int blockEnd = 0;
-	bool blockVertical = false;
-	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
-		bool vertical;
-		// TODO: Having <= instead of < below causes a 5% speedup.
-		// Maybe we should detect corner cases and put them in the optimal
-		// group of angles.
-		if (a != dims.iProjAngles)
-			vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
-		if (a == dims.iProjAngles || vertical != blockVertical) {
-			// block done
-
-			blockEnd = a;
-			if (blockStart != blockEnd) {
-				unsigned int length = dims.iVolHeight;
-				if (blockVertical)
-					length = dims.iVolWidth;
-				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
-				             (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
-				cudaStream_t stream;
-				cudaStreamCreate(&stream);
-				streams.push_back(stream);
-				//printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
-				if (!blockVertical)
-					for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
-						FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
-				else
-					for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
-						FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
-			}
-			blockVertical = vertical;
-			blockStart = a;
-		}
-	}
-
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaThreadSynchronize();
-
-	cudaTextForceKernelsCompletion();
-
-	cudaFreeArray(D_dataArray);
-		
-
-	return true;
-#endif
 }
 
 
diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h
index 4830c1e..1f0ff69 100644
--- a/cuda/2d/par_fp.h
+++ b/cuda/2d/par_fp.h
@@ -32,8 +32,8 @@ namespace astraCUDA {
 
 _AstraExport bool FP(float* D_volumeData, unsigned int volumePitch,
         float* D_projData, unsigned int projPitch,
-        const SDimensions& dims, const float* angles,
-        const float* TOffsets, float fOutputScale);
+        const SDimensions& dims, const SParProjection* angles,
+        float fOutputScale);
 
 }
 
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index f444b63..3a3c7e6 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -253,10 +253,10 @@ bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch,
 {
 	SDimensions d = dims;
 	d.iProjAngles = 1;
-	if (angles) {
+	if (parProjs) {
 		assert(!fanProjs);
 		return FP(D_volumeData, volumePitch, D_projData, projPitch,
-		          d, &angles[angle], TOffsets, outputScale);
+		          d, &parProjs[angle], outputScale);
 	} else {
 		assert(fanProjs);
 		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
@@ -268,10 +268,10 @@ bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
                        float* D_projData, unsigned int projPitch,
                        unsigned int angle, float outputScale)
 {
-	if (angles) {
+	if (parProjs) {
 		assert(!fanProjs);
 		return BP_SART(D_volumeData, volumePitch, D_projData, projPitch,
-		               angle, dims, angles, TOffsets, outputScale);
+		               angle, dims, parProjs, outputScale);
 	} else {
 		assert(fanProjs);
 		return FanBP_SART(D_volumeData, volumePitch, D_projData, projPitch,
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index 0079717..b393d7f 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -151,7 +151,7 @@ bool SIRT::precomputeWeights()
 bool SIRT::doSlabCorrections()
 {
 	// This function compensates for effectively infinitely large slab-like
-	// objects of finite thickness 1.
+	// objects of finite thickness 1 in a parallel beam geometry.
 
 	// Each ray through the object has an intersection of length d/cos(alpha).
 	// The length of the ray actually intersecting the reconstruction volume is
@@ -169,6 +169,10 @@ bool SIRT::doSlabCorrections()
 	if (useVolumeMask || useSinogramMask)
 		return false;
 
+	// Parallel-beam only
+	if (!parProjs)
+		return false;
+
 	// multiply by line weights
 	processSino<opDiv>(D_sinoData, D_lineWeight, projPitch, dims);
 
@@ -180,7 +184,9 @@ bool SIRT::doSlabCorrections()
 	float bound = cosf(1.3963f);
 	float* t = (float*)D_sinoData;
 	for (int i = 0; i < dims.iProjAngles; ++i) {
-		float f = fabs(cosf(angles[i]));
+		float angle, detsize, offset;
+		getParParameters(parProjs[i], dims.iProjDets, angle, detsize, offset);
+		float f = fabs(cosf(angle));
 
 		if (f < bound)
 			f = bound;
@@ -188,7 +194,6 @@ bool SIRT::doSlabCorrections()
 		processSino<opMul>(t, f, sinoPitch, subdims);
 		t += sinoPitch;
 	}
-
 	return true;
 }
 
@@ -298,40 +303,6 @@ float SIRT::computeDiffNorm()
 }
 
 
-bool doSIRT(float* D_volumeData, unsigned int volumePitch,
-            float* D_sinoData, unsigned int sinoPitch,
-            float* D_maskData, unsigned int maskPitch,
-            const SDimensions& dims, const float* angles,
-            const float* TOffsets, unsigned int iterations)
-{
-	SIRT sirt;
-	bool ok = true;
-
-	ok &= sirt.setGeometry(dims, angles);
-	if (D_maskData)
-		ok &= sirt.enableVolumeMask();
-	if (TOffsets)
-		ok &= sirt.setTOffsets(TOffsets);
-
-	if (!ok)
-		return false;
-
-	ok = sirt.init();
-	if (!ok)
-		return false;
-
-	if (D_maskData)
-		ok &= sirt.setVolumeMask(D_maskData, maskPitch);
-
-	ok &= sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
-	if (!ok)
-		return false;
-
-	ok = sirt.iterate(iterations);
-
-	return ok;
-}
-
 }
 
 #ifdef STANDALONE