From 3c671c08a7b8e01b030523b568998035138ddda4 Mon Sep 17 00:00:00 2001 From: Matthias Vogelgesang Date: Tue, 23 Jun 2015 09:21:18 +0200 Subject: Add SSE-based decoding of v6 format Shaves off about 30% time. --- src/ufodecode.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'src/ufodecode.c') diff --git a/src/ufodecode.c b/src/ufodecode.c index f86abfc..e00ff38 100644 --- a/src/ufodecode.c +++ b/src/ufodecode.c @@ -235,18 +235,61 @@ ufo_decode_frame_channels_v6 (UfoDecoder *decoder, uint16_t *pixel_buffer, uint3 size_t base = 0; size_t index = 0; const size_t space = 640; +#ifdef HAVE_SSE + const __m64 mask_fff = _mm_set_pi32 (0xfff, 0xfff); + __m64 mm_r, src1, src2, src3; + uint32_t *result; + + result = (uint32_t *) &mm_r; +#endif while (raw[base] != 0xAAAAAAA) { const size_t row_number = raw[base] & 0xfff; const size_t pixel_number = (raw[base + 1] >> 16) & 0xfff; base += 2; - index = row_number * IPECAMERA_WIDTH_20MP + pixel_number; +#ifdef HAVE_SSE + src1 = _mm_set_pi32 (raw[base], raw[base + 3]); + src2 = _mm_set_pi32 (raw[base + 1], raw[base + 4]); + src3 = _mm_set_pi32 (raw[base + 2], raw[base + 5]); + + mm_r = _mm_srli_pi32 (src1, 20); + pixel_buffer[index + 0 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 0 * space] = result[1]; + + mm_r = _mm_and_si64 (_mm_srli_pi32 (src1, 8), mask_fff); + pixel_buffer[index + 1 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 1 * space] = result[1]; + + mm_r = _mm_or_si64 (_mm_and_si64 (_mm_slli_pi32 (src1, 4), mask_fff), _mm_srli_pi32 (src2, 28)); + pixel_buffer[index + 2 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 2 * space] = result[1]; + + mm_r = _mm_and_si64 (_mm_srli_pi32 (src2, 16), mask_fff); + pixel_buffer[index + 3 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 3 * space] = result[1]; + + mm_r = _mm_and_si64 (_mm_srli_pi32 (src2, 4), mask_fff); + pixel_buffer[index + 4 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 4 * space] = result[1]; + + mm_r = _mm_or_si64 (_mm_and_si64 (_mm_slli_pi32 (src2, 8), mask_fff), _mm_srli_pi32 (src3, 24)); + pixel_buffer[index + 5 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 5 * space] = result[1]; + + mm_r = _mm_and_si64 (_mm_srli_pi32 (src3, 12), mask_fff); + pixel_buffer[index + 6 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 6 * space] = result[1]; + + mm_r = _mm_and_si64 (src3, mask_fff); + pixel_buffer[index + 7 * space] = result[0]; + pixel_buffer[index + IPECAMERA_WIDTH_20MP + 7 * space] = result[1]; +#else pixel_buffer[index + 0 * space] = (raw[base] >> 20); pixel_buffer[index + 1 * space] = (raw[base] >> 8) & 0xfff; - pixel_buffer[index + 2 * space] = ((raw[base] << 4) & 0xff) | ((raw[base + 1] >> 28) & 0xff); + pixel_buffer[index + 2 * space] = ((raw[base] << 4) & 0xfff) | (raw[base + 1] >> 28); pixel_buffer[index + 3 * space] = (raw[base + 1] >> 16) & 0xfff; pixel_buffer[index + 4 * space] = (raw[base + 1] >> 4) & 0xfff; pixel_buffer[index + 5 * space] = ((raw[base + 1] << 8) & 0xfff) | (raw[base + 2] >> 24); @@ -262,6 +305,7 @@ ufo_decode_frame_channels_v6 (UfoDecoder *decoder, uint16_t *pixel_buffer, uint3 pixel_buffer[index + 5 * space] = ((raw[base + 4] << 8) & 0xfff) | (raw[base + 5] >> 24); pixel_buffer[index + 6 * space] = (raw[base + 5] >> 12) & 0xfff; pixel_buffer[index + 7 * space] = (raw[base + 5] & 0xfff); +#endif base += 6; } -- cgit v1.2.3